chore: evals

2026-05-19 18:45:15 +02:00 · 2026-05-13 14:02:26 -07:00 · 2026-05-13 14:02:26 -07:00 · 3737118050
commit 3737118050
parent 2402b730fa
122 changed files with 22598 additions and 13 deletions
--- a/surfsense_evals/tests/__init__.py
+++ b/surfsense_evals/tests/__init__.py
@ -0,0 +1 @@
+
--- a/surfsense_evals/tests/conftest.py
+++ b/surfsense_evals/tests/conftest.py
@ -0,0 +1,34 @@
+"""Shared pytest fixtures for surfsense-evals."""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.core.config import Config
+
+
+@pytest.fixture
+def tmp_env(monkeypatch, tmp_path: Path) -> Path:
+    """Give a test a scrubbed environment and private data/report paths.
+
+    Every ``SURFSENSE_*`` / ``OPENROUTER_*`` / ``EVAL_*`` variable is
+    removed first, so values exported in the caller's shell cannot leak
+    into a test that sets only the credentials it needs.
+    ``EVAL_DATA_DIR`` and ``EVAL_REPORTS_DIR`` are then pointed at
+    sub-paths of ``tmp_path``, which is also the return value.
+    """
+
+    scrubbed_prefixes = ("SURFSENSE_", "OPENROUTER_", "EVAL_")
+    # Collect the names up front so os.environ is not mutated mid-iteration.
+    leaked = [name for name in os.environ if name.startswith(scrubbed_prefixes)]
+    for name in leaked:
+        monkeypatch.delenv(name, raising=False)
+    for variable, subdir in (("EVAL_DATA_DIR", "data"), ("EVAL_REPORTS_DIR", "reports")):
+        monkeypatch.setenv(variable, str(tmp_path / subdir))
+    return tmp_path
+
+
+@pytest.fixture
+def isolated_config(tmp_env: Path) -> Config:  # noqa: ARG001
+    """Return the ``Config`` that ``load_config`` builds under ``tmp_env``."""
+    from surfsense_evals.core.config import load_config
+
+    # ``tmp_env`` is requested only for its environment side effects.
+    config = load_config()
+    return config
--- a/surfsense_evals/tests/core/__init__.py
+++ b/surfsense_evals/tests/core/__init__.py
@ -0,0 +1 @@
+
--- a/surfsense_evals/tests/core/test_auth.py
+++ b/surfsense_evals/tests/core/test_auth.py
@ -0,0 +1,95 @@
+"""Auth credential resolution + 401 refresh hook."""
+
+from __future__ import annotations
+
+import httpx
+import pytest
+import respx
+
+from surfsense_evals.core.auth import (
+    CredentialError,
+    acquire_token,
+    client_with_auth,
+)
+from surfsense_evals.core.config import Config
+
+
+def _make_config(**overrides) -> Config:
+    """Build a ``Config`` for the auth tests; keyword arguments override defaults.
+
+    Defaults are the ``http://test`` API base, the OpenRouter base URL and
+    ``None`` for every credential field. A missing or falsy ``data_dir`` /
+    ``reports_dir`` is replaced with a placeholder path under ``/tmp``.
+    """
+    # Config requires Path objects here; the tests never touch the filesystem.
+    from pathlib import Path
+
+    fields = {
+        "surfsense_api_base": "http://test",
+        "openrouter_api_key": None,
+        "openrouter_base_url": "https://openrouter.ai/api/v1",
+        "surfsense_jwt": None,
+        "surfsense_refresh_token": None,
+        "surfsense_user_email": None,
+        "surfsense_user_password": None,
+        "data_dir": None,
+        "reports_dir": None,
+        **overrides,
+    }
+    if not fields["data_dir"]:
+        fields["data_dir"] = Path("/tmp/eval_test_data")
+    if not fields["reports_dir"]:
+        fields["reports_dir"] = Path("/tmp/eval_test_reports")
+    return Config(**fields)
+
+
+@pytest.mark.asyncio
+async def test_acquire_token_jwt_mode_short_circuits():
+    """A configured JWT comes back as the bundle's tokens, with mode ``jwt``."""
+    jwt_config = _make_config(surfsense_jwt="abc", surfsense_refresh_token="ref")
+
+    tokens = await acquire_token(jwt_config)
+
+    assert tokens.access_token == "abc"
+    assert tokens.refresh_token == "ref"
+    assert tokens.mode == "jwt"
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_acquire_token_local_mode_posts_form():
+    """Email + password credentials yield the tokens served by the login route."""
+    login_payload = {"access_token": "T", "refresh_token": "R", "token_type": "bearer"}
+    login_reply = httpx.Response(200, json=login_payload)
+    respx.post("http://test/auth/jwt/login").mock(return_value=login_reply)
+
+    local_config = _make_config(
+        surfsense_user_email="u@example.com",
+        surfsense_user_password="pw",
+    )
+    tokens = await acquire_token(local_config)
+
+    assert tokens.access_token == "T"
+    assert tokens.refresh_token == "R"
+    assert tokens.mode == "local"
+
+
+@pytest.mark.asyncio
+async def test_acquire_token_no_credentials():
+    """Without credentials the error message names both env-var options."""
+    empty_config = _make_config()
+
+    with pytest.raises(CredentialError) as caught:
+        await acquire_token(empty_config)
+
+    message = str(caught.value)
+    for env_var in ("SURFSENSE_USER_EMAIL", "SURFSENSE_JWT"):
+        assert env_var in message
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_client_with_auth_refreshes_on_401():
+    """A 401 reply is expected to end in a 200 with the bundle's tokens rotated."""
+    stale_config = _make_config(surfsense_jwt="old", surfsense_refresh_token="ref")
+    tokens = await acquire_token(stale_config)
+
+    refresh_reply = httpx.Response(
+        200, json={"access_token": "new", "refresh_token": "ref2"}
+    )
+    respx.post("http://test/auth/jwt/refresh").mock(return_value=refresh_reply)
+    # The endpoint rejects the first attempt and accepts the one after it.
+    endpoint_replies = [
+        httpx.Response(401, json={"detail": "expired"}),
+        httpx.Response(200, json=[]),
+    ]
+    respx.get("http://test/api/v1/searchspaces").mock(side_effect=endpoint_replies)
+
+    async with client_with_auth(stale_config, tokens) as authed:
+        reply = await authed.get("http://test/api/v1/searchspaces")
+
+    assert reply.status_code == 200
+    assert tokens.access_token == "new"
+    assert tokens.refresh_token == "ref2"
--- a/surfsense_evals/tests/core/test_clients.py
+++ b/surfsense_evals/tests/core/test_clients.py
@ -0,0 +1,262 @@
+"""respx-mocked tests for the SurfSense HTTP clients."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import httpx
+import pytest
+import respx
+
+from surfsense_evals.core.clients import (
+    DocumentsClient,
+    NewChatClient,
+    SearchSpaceClient,
+)
+from surfsense_evals.core.clients.new_chat import ThreadBusyError
+
+_BASE = "http://test"
+
+
+@pytest.fixture
+def http() -> httpx.AsyncClient:
+    """Provide an ``httpx.AsyncClient`` whose base URL is ``_BASE``."""
+    # NOTE(review): the client is never closed by this fixture — confirm
+    # that is acceptable for these mocked tests.
+    client = httpx.AsyncClient(base_url=_BASE)
+    return client
+
+
+# ---------------------------------------------------------------------------
+# SearchSpaceClient
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_create_search_space_returns_row(respx_mock, http):
+    """``create`` exposes the id and name carried by the POST response."""
+    created_row = {
+        "id": 99,
+        "name": "eval-medical-2026",
+        "description": None,
+        "user_id": "user-x",
+        "citations_enabled": True,
+        "qna_custom_instructions": None,
+    }
+    respx_mock.post("/api/v1/searchspaces").mock(
+        return_value=httpx.Response(200, json=created_row)
+    )
+
+    search_spaces = SearchSpaceClient(http, _BASE)
+    row = await search_spaces.create("eval-medical-2026")
+
+    assert row.id == 99
+    assert row.name == "eval-medical-2026"
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_delete_search_space_idempotent_on_404(respx_mock, http):
+    """Deleting a search space the server answers with 404 must not raise."""
+    not_found = httpx.Response(404, json={"detail": "gone"})
+    respx_mock.delete("/api/v1/searchspaces/42").mock(return_value=not_found)
+
+    search_spaces = SearchSpaceClient(http, _BASE)
+    # The test passes when this await completes without an exception.
+    await search_spaces.delete(42)
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_set_llm_preferences_partial_update(respx_mock, http):
+    """Only the preference that was passed is sent in the PUT body."""
+    prefs_payload = {
+        "agent_llm_id": -10042,
+        "document_summary_llm_id": None,
+        "image_generation_config_id": None,
+        "vision_llm_config_id": None,
+        "agent_llm": {
+            "id": -10042,
+            "provider": "OPENROUTER",
+            "model_name": "anthropic/claude-sonnet-4.5",
+        },
+    }
+    prefs_route = respx_mock.put("/api/v1/search-spaces/42/llm-preferences").mock(
+        return_value=httpx.Response(200, json=prefs_payload)
+    )
+
+    search_spaces = SearchSpaceClient(http, _BASE)
+    prefs = await search_spaces.set_llm_preferences(42, agent_llm_id=-10042)
+
+    assert prefs.agent_llm_id == -10042
+    assert prefs.agent_llm["provider"] == "OPENROUTER"
+    # Inspect the most recent request recorded on the mocked route.
+    last_request = prefs_route.calls[-1].request
+    assert json.loads(last_request.content) == {"agent_llm_id": -10042}
+
+
+# ---------------------------------------------------------------------------
+# DocumentsClient
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_documents_status_parses_state(respx_mock, http):
+    """Each returned status carries the readiness of its own document.
+
+    The mocked endpoint reports document 1 as ``ready`` and document 2 as
+    ``failed``. Readiness is checked per document id, so a parser that
+    attached the states to the wrong documents fails the test.
+    """
+    status_items = [
+        {"id": 1, "title": "a.pdf", "document_type": "FILE",
+         "status": {"state": "ready", "reason": None}},
+        {"id": 2, "title": "b.pdf", "document_type": "FILE",
+         "status": {"state": "failed", "reason": "ETL boom"}},
+    ]
+    respx_mock.get("/api/v1/documents/status").mock(
+        return_value=httpx.Response(200, json={"items": status_items})
+    )
+
+    client = DocumentsClient(http, _BASE)
+    statuses = await client.get_status(search_space_id=1, document_ids=[1, 2])
+
+    # Pair every id with its readiness instead of comparing two unrelated sets.
+    readiness = {s.document_id: s.is_ready for s in statuses}
+    assert readiness == {1: True, 2: False}
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_documents_upload_returns_payload(respx_mock, http, tmp_path: Path):
+    """``upload`` surfaces the document ids and pending count from the reply."""
+    pdf_path = tmp_path / "a.pdf"
+    pdf_path.write_bytes(b"%PDF-1.4 small")
+
+    upload_payload = {
+        "message": "Files uploaded",
+        "document_ids": [101],
+        "duplicate_document_ids": [],
+        "total_files": 1,
+        "pending_files": 1,
+        "skipped_duplicates": 0,
+    }
+    respx_mock.post("/api/v1/documents/fileupload").mock(
+        return_value=httpx.Response(200, json=upload_payload)
+    )
+
+    documents = DocumentsClient(http, _BASE)
+    outcome = await documents.upload(files=[pdf_path], search_space_id=7)
+
+    assert outcome.document_ids == [101]
+    assert outcome.pending_files == 1
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_documents_list_chunks_paginated(respx_mock, http):
+    """``list_chunks`` returns the chunks of both mocked pages, in order."""
+    first_page = {
+        "items": [{"id": 1, "content": "a"}, {"id": 2, "content": "b"}],
+        "total": 3, "page": 0, "page_size": 2, "has_more": True,
+    }
+    second_page = {
+        "items": [{"id": 3, "content": "c"}],
+        "total": 3, "page": 1, "page_size": 2, "has_more": False,
+    }
+    # Successive GETs are answered with the pages in this order.
+    respx_mock.get("/api/v1/documents/5/chunks").mock(
+        side_effect=[
+            httpx.Response(200, json=first_page),
+            httpx.Response(200, json=second_page),
+        ]
+    )
+
+    documents = DocumentsClient(http, _BASE)
+    chunks = await documents.list_chunks(5, page_size=2)
+
+    assert [chunk.id for chunk in chunks] == [1, 2, 3]
+
+
+# ---------------------------------------------------------------------------
+# NewChatClient
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_create_thread_returns_id(respx_mock, http):
+    """``create_thread`` returns the ``id`` field of the created thread."""
+    thread_payload = {
+        "id": 555,
+        "title": "eval",
+        "archived": False,
+        "visibility": "PRIVATE",
+        "search_space_id": 1,
+        "messages": [],
+        "created_at": "2026-05-11T00:00:00Z",
+        "updated_at": "2026-05-11T00:00:00Z",
+    }
+    respx_mock.post("/api/v1/threads").mock(
+        return_value=httpx.Response(200, json=thread_payload)
+    )
+
+    chat = NewChatClient(http, _BASE)
+    thread_id = await chat.create_thread(search_space_id=1)
+
+    assert thread_id == 555
+
+
+def _sse_body(events: list[dict]) -> bytes:
+    """Encode *events* as a UTF-8 SSE payload terminated by ``[DONE]``.
+
+    Each event becomes one ``data: <json>`` frame followed by a blank line;
+    a final ``data: [DONE]`` frame is always appended.
+    """
+    frames = [f"data: {json.dumps(event)}\n\n" for event in events]
+    frames.append("data: [DONE]\n\n")
+    return "".join(frames).encode("utf-8")
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_ask_accumulates_text_deltas(respx_mock, http):
+    """Text deltas are joined into the answer and the citation is extracted."""
+    stream_events = [
+        {"type": "start", "messageId": "m1"},
+        {"type": "text-start", "id": "t1"},
+        {"type": "text-delta", "id": "t1", "delta": "Answer "},
+        {"type": "text-delta", "id": "t1", "delta": "is "},
+        {"type": "text-delta", "id": "t1", "delta": "B [citation:42]."},
+        {"type": "text-end", "id": "t1"},
+        {"type": "finish"},
+    ]
+    stream_reply = httpx.Response(
+        200,
+        content=_sse_body(stream_events),
+        headers={"Content-Type": "text/event-stream"},
+    )
+    respx_mock.post("/api/v1/new_chat").mock(return_value=stream_reply)
+
+    chat = NewChatClient(http, _BASE)
+    answer = await chat.ask(
+        thread_id=1,
+        search_space_id=2,
+        user_query="What is the answer?",
+    )
+
+    assert answer.text == "Answer is B [citation:42]."
+    assert answer.finished_normally is True
+    assert any(citation["chunk_id"] == 42 for citation in answer.citations)
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_ask_409_thread_busy_retries(respx_mock, http):
+    """A 409 ``THREAD_BUSY`` reply followed by a 200 stream yields the text."""
+    stream_bytes = _sse_body([
+        {"type": "text-delta", "id": "t1", "delta": "ok"},
+        {"type": "finish"},
+    ])
+    busy_reply = httpx.Response(
+        409,
+        json={"detail": {"errorCode": "THREAD_BUSY", "message": "busy"}},
+        headers={"Retry-After": "1"},
+    )
+    ok_reply = httpx.Response(
+        200,
+        content=stream_bytes,
+        headers={"Content-Type": "text/event-stream"},
+    )
+    # The first POST is answered with the 409, the second with the stream.
+    respx_mock.post("/api/v1/new_chat").mock(side_effect=[busy_reply, ok_reply])
+
+    chat = NewChatClient(http, _BASE)
+    answer = await chat.ask(
+        thread_id=1,
+        search_space_id=2,
+        user_query="hi",
+        max_busy_retries=2,
+    )
+
+    assert answer.text == "ok"
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_ask_409_exhausts_retries(respx_mock, http):
+    """A route that always answers 409 ends in ``ThreadBusyError``."""
+    always_busy = httpx.Response(
+        409,
+        json={"detail": {"errorCode": "TURN_CANCELLING", "message": "wait"}},
+        headers={"Retry-After": "1"},
+    )
+    respx_mock.post("/api/v1/new_chat").mock(return_value=always_busy)
+
+    chat = NewChatClient(http, _BASE)
+    with pytest.raises(ThreadBusyError):
+        await chat.ask(
+            thread_id=1,
+            search_space_id=2,
+            user_query="hi",
+            max_busy_retries=1,
+        )
--- a/surfsense_evals/tests/core/test_config.py
+++ b/surfsense_evals/tests/core/test_config.py
@ -0,0 +1,160 @@
+"""Tests for env loading + state.json read/write."""
+
+from __future__ import annotations
+
+import json
+
+from surfsense_evals.core.config import (
+    DEFAULT_SCENARIO,
+    SCENARIOS,
+    SuiteState,
+    clear_suite_state,
+    get_suite_state,
+    load_config,
+    set_suite_state,
+)
+
+
+def test_load_config_defaults_to_localhost(tmp_env):  # noqa: ARG001
+    """A scrubbed environment gives the localhost API base and no credentials."""
+    cfg = load_config()
+
+    assert cfg.surfsense_api_base == "http://localhost:8000"
+    # Neither credential mode is available, so the reported mode is "none".
+    assert cfg.has_jwt_mode() is False
+    assert cfg.has_local_mode() is False
+    assert cfg.credential_mode() == "none"
+
+
+def test_load_config_picks_up_jwt_env(tmp_env, monkeypatch):  # noqa: ARG001
+    """Setting ``SURFSENSE_JWT`` switches the credential mode to ``jwt``."""
+    monkeypatch.setenv("SURFSENSE_JWT", "tok")
+
+    cfg = load_config()
+
+    assert cfg.credential_mode() == "jwt"
+
+
+def test_load_config_picks_up_local_env(tmp_env, monkeypatch):  # noqa: ARG001
+    """An email + password pair switches the credential mode to ``local``."""
+    credentials = (
+        ("SURFSENSE_USER_EMAIL", "u@x.com"),
+        ("SURFSENSE_USER_PASSWORD", "pw"),
+    )
+    for variable, value in credentials:
+        monkeypatch.setenv(variable, value)
+
+    cfg = load_config()
+
+    assert cfg.credential_mode() == "local"
+
+
+def test_state_roundtrip_per_suite(tmp_env):  # noqa: ARG001
+    """Suite states are stored, fetched and cleared independently by name."""
+    cfg = load_config()
+    assert get_suite_state(cfg, "medical") is None
+
+    medical_state = SuiteState(
+        search_space_id=1,
+        agent_llm_id=-10042,
+        provider_model="anthropic/claude-sonnet-4.5",
+        created_at="2026-05-11T20-30-00Z",
+    )
+    set_suite_state(cfg, "medical", medical_state)
+    legal_state = SuiteState(
+        search_space_id=2,
+        agent_llm_id=-1,
+        provider_model="openai/gpt-5",
+        created_at="2026-05-11T21-00-00Z",
+    )
+    set_suite_state(cfg, "legal", legal_state)
+
+    loaded = get_suite_state(cfg, "medical")
+    assert loaded.search_space_id == 1
+    assert loaded.provider_model == "anthropic/claude-sonnet-4.5"
+
+    # Clearing "medical" must leave the "legal" entry in place.
+    was_cleared = clear_suite_state(cfg, "medical")
+    assert was_cleared is True
+    assert get_suite_state(cfg, "medical") is None
+    assert get_suite_state(cfg, "legal").search_space_id == 2
+
+    # The state file on disk must agree with what the accessors report.
+    on_disk = json.loads(cfg.state_path.read_text(encoding="utf-8"))
+    stored_suites = on_disk["suites"]
+    assert "medical" not in stored_suites
+    assert "legal" in stored_suites
+
+
+def test_paths_are_per_suite(tmp_env):  # noqa: ARG001
+    """Each suite gets its own data dir; report/run/map dirs follow the layout."""
+    cfg = load_config()
+
+    medical_data = cfg.suite_data_dir("medical")
+    legal_data = cfg.suite_data_dir("legal")
+    assert medical_data != legal_data
+
+    medical_reports = cfg.suite_reports_dir("medical")
+    assert medical_reports.parent == cfg.reports_dir
+    assert cfg.suite_runs_dir("medical").name == "runs"
+    assert cfg.suite_maps_dir("medical").name == "maps"
+
+
+# ---------------------------------------------------------------------------
+# Scenario state — back-compat + new fields
+# ---------------------------------------------------------------------------
+
+
+def test_legacy_state_back_compat_defaults_to_head_to_head():
+    """A state dict written before scenarios existed must still load.
+
+    When the ``scenario`` / ``vision_*`` / ``native_arm_model`` keys are
+    absent, the scenario is expected to default to ``head-to-head`` and
+    the other fields to ``None``, so both arms keep answering with
+    ``provider_model`` as they did before the upgrade.
+    """
+
+    pre_scenario_payload = {
+        "search_space_id": 7,
+        "agent_llm_id": -123,
+        "provider_model": "anthropic/claude-sonnet-4.5",
+        "created_at": "2026-05-11T20-30-00Z",
+        "ingestion_maps": {},
+    }
+
+    loaded = SuiteState.from_dict(pre_scenario_payload)
+
+    assert loaded.scenario == DEFAULT_SCENARIO
+    assert DEFAULT_SCENARIO == "head-to-head"
+    assert loaded.vision_llm_config_id is None
+    assert loaded.vision_provider_model is None
+    assert loaded.native_arm_model is None
+    # With no explicit native-arm model, it should match the SurfSense slug.
+    assert loaded.effective_native_arm_model == loaded.provider_model
+
+
+def test_unknown_scenario_falls_back_to_default():
+    """An unrecognised scenario name loads as the default instead of failing.
+
+    Defensive: a stale state file should load with the safe head-to-head
+    behaviour rather than abort the whole run with a KeyError.
+    """
+
+    stale_payload = {
+        "search_space_id": 1,
+        "agent_llm_id": -1,
+        "provider_model": "openai/gpt-5",
+        "scenario": "unknown-scenario-name",
+    }
+
+    loaded = SuiteState.from_dict(stale_payload)
+
+    assert loaded.scenario == DEFAULT_SCENARIO
+
+
+def test_cost_arbitrage_state_persists_native_arm_model(tmp_env):  # noqa: ARG001
+    """Cost-arbitrage fields survive a save and reload of the suite state."""
+    cfg = load_config()
+    native_slug = "anthropic/claude-sonnet-4.5"
+    saved = SuiteState(
+        search_space_id=42,
+        agent_llm_id=-1,
+        provider_model="openai/gpt-5.4-mini",
+        created_at="2026-05-11T20-30-00Z",
+        scenario="cost-arbitrage",
+        vision_llm_config_id=-101,
+        vision_provider_model=native_slug,
+        native_arm_model=native_slug,
+    )
+    set_suite_state(cfg, "medical", saved)
+
+    loaded = get_suite_state(cfg, "medical")
+    assert loaded.scenario == "cost-arbitrage"
+    assert loaded.vision_llm_config_id == -101
+    assert loaded.vision_provider_model == "anthropic/claude-sonnet-4.5"
+    assert loaded.native_arm_model == "anthropic/claude-sonnet-4.5"
+    # The point of cost arbitrage: the native arm uses a different slug.
+    assert loaded.effective_native_arm_model != loaded.provider_model
+    assert loaded.effective_native_arm_model == "anthropic/claude-sonnet-4.5"
+
+    # The scenario must also be present in the JSON written to disk.
+    on_disk = json.loads(cfg.state_path.read_text(encoding="utf-8"))
+    assert on_disk["suites"]["medical"]["scenario"] == "cost-arbitrage"
+
+
+def test_scenario_constants_are_stable():
+    """Pin the public scenario names; runners and tests key off these strings."""
+
+    expected_scenarios = ("head-to-head", "symmetric-cheap", "cost-arbitrage")
+    assert SCENARIOS == expected_scenarios
+    assert DEFAULT_SCENARIO == "head-to-head"
--- a/surfsense_evals/tests/core/test_ingest_settings.py
+++ b/surfsense_evals/tests/core/test_ingest_settings.py
@ -0,0 +1,269 @@
+"""Unit tests for ``surfsense_evals.core.ingest_settings``.
+
+Covers:
+
+* ``IngestSettings.merge`` honours operator overrides and falls back
+  to per-benchmark defaults when the operator is silent.
+* ``add_ingest_settings_args`` exposes the three flag pairs and
+  argparse defaults of ``None`` correctly distinguish "not passed"
+  from "explicitly false".
+* ``settings_header_line`` / ``read_settings_header`` round-trip
+  through a JSONL file.
+* ``read_settings_header`` is fault-tolerant: missing files, missing
+  header, malformed JSON.
+* ``format_ingest_settings_md`` produces a stable Markdown bullet.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.core.ingest_settings import (
+    PROCESSING_MODES,
+    SETTINGS_HEADER_KEY,
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+    read_settings_header,
+    settings_header_line,
+)
+
+# ---------------------------------------------------------------------------
+# IngestSettings.merge
+# ---------------------------------------------------------------------------
+
+
+class TestMerge:
+    """``IngestSettings.merge``: operator overrides layered over defaults."""
+
+    def test_silent_operator_uses_defaults(self) -> None:
+        """An empty override mapping yields settings equal to the defaults."""
+        base = IngestSettings(
+            use_vision_llm=True, processing_mode="basic", should_summarize=True
+        )
+        assert IngestSettings.merge(base, {}) == base
+
+    def test_explicit_false_overrides_default_true(self) -> None:
+        """An explicit ``False`` wins over a default of ``True``."""
+        base = IngestSettings(use_vision_llm=True)
+        overrides = {"use_vision_llm": False}
+        result = IngestSettings.merge(base, overrides)
+        assert result.use_vision_llm is False
+
+    def test_explicit_true_overrides_default_false(self) -> None:
+        """An explicit ``True`` wins over a default of ``False``."""
+        base = IngestSettings(use_vision_llm=False)
+        overrides = {"use_vision_llm": True}
+        result = IngestSettings.merge(base, overrides)
+        assert result.use_vision_llm is True
+
+    def test_none_means_silent(self) -> None:
+        """``None`` is treated as "not passed", so the default is kept."""
+        # argparse's BooleanOptionalAction leaves None when the operator
+        # passed neither --use-vision-llm nor --no-vision-llm.
+        base = IngestSettings(use_vision_llm=True)
+        overrides = {"use_vision_llm": None}
+        result = IngestSettings.merge(base, overrides)
+        assert result.use_vision_llm is True
+
+    def test_processing_mode_override(self) -> None:
+        """A valid ``processing_mode`` override replaces the default."""
+        base = IngestSettings(processing_mode="basic")
+        overrides = {"processing_mode": "premium"}
+        result = IngestSettings.merge(base, overrides)
+        assert result.processing_mode == "premium"
+
+    def test_processing_mode_invalid_raises(self) -> None:
+        """An unknown ``processing_mode`` is rejected with ``ValueError``."""
+        base = IngestSettings(processing_mode="basic")
+        bad_overrides = {"processing_mode": "exotic"}
+        with pytest.raises(ValueError, match="Invalid processing_mode"):
+            IngestSettings.merge(base, bad_overrides)
+
+    def test_processing_mode_blank_falls_back(self) -> None:
+        """An empty-string ``processing_mode`` keeps the default."""
+        base = IngestSettings(processing_mode="basic")
+        result = IngestSettings.merge(base, {"processing_mode": ""})
+        assert result.processing_mode == "basic"
+
+    def test_string_truthy_coerced(self) -> None:
+        """The string ``"yes"`` is coerced to ``True``."""
+        base = IngestSettings(use_vision_llm=False)
+        result = IngestSettings.merge(base, {"use_vision_llm": "yes"})
+        assert result.use_vision_llm is True
+
+    def test_string_falsy_coerced(self) -> None:
+        """The string ``"false"`` is coerced to ``False``."""
+        base = IngestSettings(use_vision_llm=True)
+        result = IngestSettings.merge(base, {"use_vision_llm": "false"})
+        assert result.use_vision_llm is False
+
+    def test_other_keys_ignored(self) -> None:
+        """Keys unrelated to ingest settings are tolerated and ignored."""
+        # Benchmarks hand over their whole opts dict, so merge must cope
+        # with keys it does not know about.
+        base = IngestSettings(use_vision_llm=True, processing_mode="basic")
+        whole_opts = {
+            "use_vision_llm": False,
+            "concurrency": 4,
+            "task_filter": "all",
+            "no_mentions": True,
+        }
+        result = IngestSettings.merge(base, whole_opts)
+        assert result.use_vision_llm is False
+        assert result.processing_mode == "basic"
+
+    def test_to_dict_round_trips(self) -> None:
+        """``to_dict`` emits exactly the three settings keys."""
+        settings = IngestSettings(
+            use_vision_llm=True, processing_mode="premium", should_summarize=False
+        )
+        expected = {
+            "use_vision_llm": True,
+            "processing_mode": "premium",
+            "should_summarize": False,
+        }
+        assert settings.to_dict() == expected
+
+    def test_render_label_format(self) -> None:
+        """``render_label`` produces the compact one-line summary."""
+        settings = IngestSettings(
+            use_vision_llm=True, processing_mode="premium", should_summarize=True
+        )
+        label = settings.render_label()
+        assert label == "vision=on, mode=premium, summarize=on"
+
+
+# ---------------------------------------------------------------------------
+# add_ingest_settings_args
+# ---------------------------------------------------------------------------
+
+
+class TestAddArgs:
+    """``add_ingest_settings_args``: the CLI flags and their parsed values."""
+
+    @pytest.fixture
+    def parser(self) -> argparse.ArgumentParser:
+        """Build a parser carrying the ingest flags with all-off defaults."""
+        all_off = IngestSettings(
+            use_vision_llm=False, processing_mode="basic", should_summarize=False
+        )
+        cli = argparse.ArgumentParser()
+        add_ingest_settings_args(cli, defaults=all_off)
+        return cli
+
+    def test_silent_invocation_yields_none(self, parser: argparse.ArgumentParser) -> None:
+        """With no flags, every ingest option parses to ``None``."""
+        parsed = parser.parse_args([])
+        assert parsed.use_vision_llm is None
+        assert parsed.processing_mode is None
+        assert parsed.should_summarize is None
+
+    def test_use_vision_llm_flag(self, parser: argparse.ArgumentParser) -> None:
+        """``--use-vision-llm`` parses to ``True``."""
+        parsed = parser.parse_args(["--use-vision-llm"])
+        assert parsed.use_vision_llm is True
+
+    def test_no_vision_llm_flag(self, parser: argparse.ArgumentParser) -> None:
+        """``--no-vision-llm`` parses to ``False``."""
+        parsed = parser.parse_args(["--no-vision-llm"])
+        assert parsed.use_vision_llm is False
+
+    def test_processing_mode_choices(self, parser: argparse.ArgumentParser) -> None:
+        """Every entry of ``PROCESSING_MODES`` is accepted verbatim."""
+        for mode in PROCESSING_MODES:
+            parsed = parser.parse_args(["--processing-mode", mode])
+            assert parsed.processing_mode == mode
+
+    def test_processing_mode_rejects_unknown(
+        self, parser: argparse.ArgumentParser
+    ) -> None:
+        """An unknown processing mode makes argparse exit."""
+        with pytest.raises(SystemExit):
+            parser.parse_args(["--processing-mode", "exotic"])
+
+    def test_summarize_flag_pair(self, parser: argparse.ArgumentParser) -> None:
+        """``--should-summarize`` / ``--no-summarize`` parse to True / False."""
+        enabled = parser.parse_args(["--should-summarize"])
+        assert enabled.should_summarize is True
+        disabled = parser.parse_args(["--no-summarize"])
+        assert disabled.should_summarize is False
+
+    def test_vision_flags_mutually_exclusive(
+        self, parser: argparse.ArgumentParser
+    ) -> None:
+        """Passing both vision flags together makes argparse exit."""
+        conflicting = ["--use-vision-llm", "--no-vision-llm"]
+        with pytest.raises(SystemExit):
+            parser.parse_args(conflicting)
+
+    def test_full_pipeline(self, parser: argparse.ArgumentParser) -> None:
+        """Parsed flags merged over the defaults give what was asked for."""
+        # The operator sets two options and leaves the third to its default.
+        parsed = parser.parse_args(
+            ["--use-vision-llm", "--processing-mode", "premium"]
+        )
+        base = IngestSettings(
+            use_vision_llm=False, processing_mode="basic", should_summarize=False
+        )
+        expected = IngestSettings(
+            use_vision_llm=True, processing_mode="premium", should_summarize=False
+        )
+        assert IngestSettings.merge(base, vars(parsed)) == expected
+
+
+# ---------------------------------------------------------------------------
+# Header round-trip + read_settings_header fault tolerance
+# ---------------------------------------------------------------------------
+
+
+class TestHeader:
+    """Settings header: JSONL round-trip and tolerant reading."""
+
+    def test_header_line_round_trip(self, tmp_path: Path) -> None:
+        """A header written ahead of a data row is read back as ``to_dict()``."""
+        settings = IngestSettings(use_vision_llm=True, processing_mode="premium")
+        map_file = tmp_path / "map.jsonl"
+        data_row = json.dumps({"case_id": "x", "document_id": 1})
+        with map_file.open("w", encoding="utf-8") as out:
+            out.write(settings_header_line(settings) + "\n")
+            out.write(data_row + "\n")
+        assert read_settings_header(map_file) == settings.to_dict()
+
+    def test_is_settings_header_recognises(self) -> None:
+        """Only a row containing ``SETTINGS_HEADER_KEY`` counts as a header."""
+        header_row = {SETTINGS_HEADER_KEY: {}}
+        data_row = {"case_id": "x"}
+        assert is_settings_header(header_row)
+        assert not is_settings_header(data_row)
+
+    def test_missing_file_returns_empty(self, tmp_path: Path) -> None:
+        """A path that does not exist reads as an empty mapping."""
+        absent = tmp_path / "does_not_exist.jsonl"
+        assert read_settings_header(absent) == {}
+
+    def test_empty_file_returns_empty(self, tmp_path: Path) -> None:
+        """A zero-length file reads as an empty mapping."""
+        empty_file = tmp_path / "empty.jsonl"
+        empty_file.write_text("", encoding="utf-8")
+        assert read_settings_header(empty_file) == {}
+
+    def test_no_header_returns_empty(self, tmp_path: Path) -> None:
+        """A file holding only data rows reads as an empty mapping."""
+        legacy_file = tmp_path / "legacy.jsonl"
+        rows = (
+            {"case_id": "x", "document_id": 1},
+            {"case_id": "y", "document_id": 2},
+        )
+        with legacy_file.open("w", encoding="utf-8") as out:
+            for row in rows:
+                out.write(json.dumps(row) + "\n")
+        assert read_settings_header(legacy_file) == {}
+
+    def test_malformed_json_returns_empty(self, tmp_path: Path) -> None:
+        """A first line that is not JSON reads as an empty mapping."""
+        broken_file = tmp_path / "broken.jsonl"
+        broken_file.write_text("not json\n", encoding="utf-8")
+        assert read_settings_header(broken_file) == {}
+
+    def test_skips_blank_first_lines(self, tmp_path: Path) -> None:
+        """Blank lines ahead of the header do not hide it."""
+        settings = IngestSettings(use_vision_llm=True)
+        padded_file = tmp_path / "padded.jsonl"
+        with padded_file.open("w", encoding="utf-8") as out:
+            out.write("\n\n")
+            out.write(settings_header_line(settings) + "\n")
+        assert read_settings_header(padded_file) == settings.to_dict()
+
+
+# ---------------------------------------------------------------------------
+# format_ingest_settings_md
+# ---------------------------------------------------------------------------
+
+
+class TestFormatMd:
+    """``format_ingest_settings_md``: Markdown rendering of the settings."""
+
+    def test_full_settings(self) -> None:
+        """Enabled settings render as ``on`` beside the processing mode."""
+        rendered = format_ingest_settings_md(
+            {"use_vision_llm": True, "processing_mode": "premium", "should_summarize": True}
+        )
+        assert "vision_llm=`on`" in rendered
+        assert "processing_mode=`premium`" in rendered
+        assert "summarize=`on`" in rendered
+
+    def test_default_off(self) -> None:
+        """Disabled settings render as ``off`` beside the processing mode."""
+        rendered = format_ingest_settings_md(
+            {"use_vision_llm": False, "processing_mode": "basic", "should_summarize": False}
+        )
+        assert "vision_llm=`off`" in rendered
+        assert "processing_mode=`basic`" in rendered
+        assert "summarize=`off`" in rendered
+
+    def test_missing_returns_re_ingest_hint(self) -> None:
+        """``None``, an empty dict and a non-mapping all render the hint."""
+        unusable_inputs = (None, {}, "not-a-mapping")
+        for unusable in unusable_inputs:
+            rendered = format_ingest_settings_md(unusable)
+            assert "(not recorded" in rendered
--- a/surfsense_evals/tests/core/test_metrics.py
+++ b/surfsense_evals/tests/core/test_metrics.py
@ -0,0 +1,153 @@
+"""Metric correctness — Wilson, McNemar, retrieval scores."""
+
+from __future__ import annotations
+
+import math
+
+import pytest
+
+from surfsense_evals.core.metrics import (
+    accuracy_with_wilson_ci,
+    bootstrap_delta_ci,
+    mcnemar_test,
+    mrr,
+    ndcg_at_k,
+    recall_at_k,
+    score_run,
+    wilson_ci,
+)
+
+# ---------------------------------------------------------------------------
+# Wilson
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "k,n,low,high",
+    [
+        (80, 100, 0.7111, 0.8666),  # cross-checked vs statsmodels.proportion_confint(method='wilson')
+        (50, 100, 0.4038, 0.5962),
+        (0, 0, 0.0, 1.0),
+        (0, 10, 0.0, 0.2775),
+        (10, 10, 0.7225, 1.0),
+    ],
+)
+def test_wilson_ci_known_values(k, n, low, high):
+    """Compare ``wilson_ci(k, n)`` with tabulated bounds (absolute tolerance 5e-4)."""
+    tolerance = 5e-4
+    observed_low, observed_high = wilson_ci(k, n)
+    low_matches = math.isclose(observed_low, low, abs_tol=tolerance)
+    assert low_matches, (k, n, observed_low, low)
+    high_matches = math.isclose(observed_high, high, abs_tol=tolerance)
+    assert high_matches, (k, n, observed_high, high)
+
+
+def test_accuracy_with_wilson_ci_object():
+    """70 successes out of 100 give accuracy 0.7 with a CI strictly inside (0, 1)."""
+    res = accuracy_with_wilson_ci(70, 100)
+    # Tolerance-based comparison: the accuracy is a computed float, so an
+    # exact ``==`` would depend on how the ratio happens to be evaluated.
+    assert res.accuracy == pytest.approx(0.7)
+    assert 0.0 < res.ci_low < res.ci_high < 1.0
+
+
+def test_invalid_inputs_raise():
+    """A negative success count and a count above the total both raise ``ValueError``."""
+    for successes, total in ((-1, 10), (11, 10)):
+        with pytest.raises(ValueError):
+            accuracy_with_wilson_ci(successes, total)
+
+
+# ---------------------------------------------------------------------------
+# McNemar
+# ---------------------------------------------------------------------------
+
+
+def test_mcnemar_degenerate_returns_p_value_one():
+    """Identical outcome vectors have no discordant pairs: p == 1.0, method "degenerate"."""
+    first_arm = [True, True, False, False]
+    second_arm = [True, True, False, False]
+    outcome = mcnemar_test(first_arm, second_arm)
+    assert outcome.b == 0
+    assert outcome.c == 0
+    assert outcome.p_value == 1.0
+    assert outcome.method == "degenerate"
+
+
+def test_mcnemar_exact_branch_strong_signal():
+    """B = 0, C = 10 → exact two-sided binomial p == 2 * (1/2)**10."""
+
+    first_arm = [True] * 10 + [False] * 10
+    # Second arm is right everywhere, including the 10 cases the first arm misses.
+    second_arm = [True] * 20
+    outcome = mcnemar_test(first_arm, second_arm)
+    assert outcome.b == 0
+    assert outcome.c == 10
+    assert outcome.method == "exact"
+    two_sided_p = 2 * 0.5 ** 10
+    assert math.isclose(outcome.p_value, two_sided_p, rel_tol=1e-9)
+
+
+def test_mcnemar_chi_square_approx_for_large_discordant():
+    """With b=15, c=5 the continuity-corrected chi-square branch is used."""
+    # Layout: 15 pairs where only the first arm is right, 5 where only the
+    # second is right, then 60 concordant pairs.
+    first_arm = [True] * 15 + [False] * 5 + [True] * 30 + [False] * 30
+    second_arm = [False] * 15 + [True] * 5 + [True] * 30 + [False] * 30
+    outcome = mcnemar_test(first_arm, second_arm)
+    assert outcome.method == "chi2_cc"
+    assert outcome.b == 15
+    assert outcome.c == 5
+    # Continuity-corrected statistic: (|15 - 5| - 1)^2 / 20 = 4.05.
+    expected_statistic = ((abs(15 - 5) - 1) ** 2) / 20.0
+    assert math.isclose(outcome.statistic, expected_statistic, rel_tol=1e-9)
+    # p ≈ chi2.sf(4.05, df=1) ≈ 0.04417
+    assert 0.04 < outcome.p_value < 0.05
+
+
+def test_mcnemar_length_mismatch():
+    """Outcome vectors of different lengths are rejected with ``ValueError``."""
+    shorter, longer = [True], [True, False]
+    with pytest.raises(ValueError):
+        mcnemar_test(shorter, longer)
+
+
+# ---------------------------------------------------------------------------
+# Bootstrap
+# ---------------------------------------------------------------------------
+
+
+def test_bootstrap_delta_ci_shape_and_determinism():
+    """Same seed gives the same delta and CI; the delta lies inside the reported CI."""
+    first_arm = [True, True, False, True, False, False, True, True]
+    second_arm = [True, True, True, True, True, False, True, False]
+    run_one = bootstrap_delta_ci(first_arm, second_arm, n_resamples=500, random_state=42)
+    run_two = bootstrap_delta_ci(first_arm, second_arm, n_resamples=500, random_state=42)
+    # Determinism under a fixed random_state.
+    assert run_one.delta == run_two.delta
+    assert run_one.ci_low == run_two.ci_low
+    assert run_one.ci_high == run_two.ci_high
+    # Shape of the result.
+    assert run_one.ci_low <= run_one.delta <= run_one.ci_high
+    assert run_one.n_resamples == 500
+
+
+# ---------------------------------------------------------------------------
+# Retrieval
+# ---------------------------------------------------------------------------
+
+
+def test_recall_at_k():
+    """Recall over three relevant ids is 1/3 at k=2 and 2/3 at k=4."""
+    ranking = ["a", "b", "c", "d"]
+    gold = ["b", "d", "z"]
+    # (cutoff, expected): the top-2 holds only "b"; the top-4 holds "b" and "d".
+    for cutoff, expected in ((2, 1 / 3), (4, 2 / 3)):
+        assert recall_at_k(ranking, gold, k=cutoff) == pytest.approx(expected)
+
+
+def test_mrr():
+    """A hit at rank 3 scores 1/3; no hit at all scores 0.0."""
+    hit_at_three = mrr(["a", "b", "c"], ["c"])
+    assert hit_at_three == pytest.approx(1 / 3)
+    no_hit = mrr(["x", "y"], ["z"])
+    assert no_hit == 0.0
+
+
+def test_ndcg_at_k_perfect_order():
+    """Ranking documents by descending grade gives nDCG@2 of 1.0."""
+    grades = {"a": 2, "b": 1}
+    score = ndcg_at_k(["a", "b"], grades, k=2)
+    assert score == pytest.approx(1.0)
+
+
+def test_ndcg_at_k_irrelevant_first():
+    """An ungraded document in first place gives a score strictly between 0 and 1."""
+    grades = {"a": 2, "b": 1}
+    # Wrong order should still be > 0 but < 1
+    score = ndcg_at_k(["c", "a", "b"], grades, k=3)
+    assert 0 < score < 1
+
+
+def test_score_run_aggregates_across_queries():
+    """``score_run`` averages recall@1 and MRR over both queries."""
+    retrieved = {"q1": ["a", "b"], "q2": ["x", "y", "z"]}
+    qrels = {"q1": {"a": 1}, "q2": {"z": 2}}
+    aggregate = score_run(
+        per_query_retrieved=retrieved,
+        per_query_qrels=qrels,
+        ks=(1, 5),
+        ndcg_k=5,
+    )
+    assert aggregate.n_queries == 2
+    # q1 hits @1, q2 doesn't
+    assert aggregate.recall_at_k[1] == pytest.approx((1 + 0) / 2)
+    # q1 hits at rank 1, q2 at rank 3
+    assert aggregate.mrr == pytest.approx((1.0 + 1 / 3) / 2)
--- a/surfsense_evals/tests/core/test_parse_answer_letter.py
+++ b/surfsense_evals/tests/core/test_parse_answer_letter.py
@ -0,0 +1,27 @@
+"""Tests for the MCQ answer-letter extractor."""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.parse import extract_answer_letter
+from surfsense_evals.core.parse.answer_letter import AnswerLetterResult
+
+
+@pytest.mark.parametrize(
+    "text,expected_letter,expected_strategy",
+    [
+        # JSON envelope, fenced or inline.
+        ('```json\n{"step_by_step_thinking": "...", "answer_choice": "B"}\n```', "B", "json_envelope"),
+        ('Reasoning... {"step_by_step_thinking": "x", "answer_choice": "C"}', "C", "json_envelope"),
+        # Explicit answer phrasing.
+        ("Long reasoning.\nAnswer: D", "D", "answer_line"),
+        ("The correct answer is (A).", "A", "answer_line"),
+        ("Final answer: e", "E", "answer_line"),
+        # A lone trailing letter.
+        ("Long reasoning.\n\nB", "B", "bare_letter"),
+        ("Long reasoning.\n\n(C).", "C", "bare_letter"),
+        # Nothing to extract.
+        ("", None, "none"),
+        ("Just narrative without an answer.", None, "none"),
+    ],
+)
+def test_extract_answer_letter(text, expected_letter, expected_strategy):
+    """Each sample yields the expected letter together with the strategy that found it."""
+    extracted = extract_answer_letter(text)
+    wanted = AnswerLetterResult(expected_letter, expected_strategy)
+    assert extracted == wanted
--- a/surfsense_evals/tests/core/test_parse_citations.py
+++ b/surfsense_evals/tests/core/test_parse_citations.py
@ -0,0 +1,108 @@
+"""Parity tests for the citation regex.
+
+Each row mirrors a case from the canonical TS reference at
+``surfsense_web/lib/citations/citation-parser.ts``. If a future PR
+loosens or tightens the TS regex, these tests will start failing;
+that's the explicit signal to re-port the change.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.parse import (
+    CITATION_REGEX,
+    ChunkCitation,
+    UrlCitation,
+    parse_citations,
+)
+
+# Parametrisation rows for ``test_citation_regex_parity``. Only the number of
+# parsed tokens and the first token are checked per row; ``None`` as the third
+# element means the first token is not inspected.
+PARITY_TABLE = [
+    # (input, expected number of matches, expected first-token kind/value)
+    ("Plain text with no citation.", 0, None),
+    (
+        "The patient has fever [citation:42] and cough.",
+        1,
+        ChunkCitation(chunk_id=42, is_docs_chunk=False),
+    ),
+    (
+        "Negative chunk ids work [citation:-7].",
+        1,
+        ChunkCitation(chunk_id=-7, is_docs_chunk=False),
+    ),
+    (
+        "doc-prefix [citation:doc-12].",
+        1,
+        ChunkCitation(chunk_id=12, is_docs_chunk=True),
+    ),
+    (
+        # One bracket holding three comma-separated ids yields three tokens.
+        "Multi id [citation:1, doc-2, -3].",
+        3,
+        ChunkCitation(chunk_id=1, is_docs_chunk=False),
+    ),
+    (
+        "URL form [citation:https://x.com/a].",
+        1,
+        UrlCitation(url="https://x.com/a"),
+    ),
+    (
+        "Chinese brackets【citation:5】.",
+        1,
+        ChunkCitation(chunk_id=5, is_docs_chunk=False),
+    ),
+    (
+        # U+200B (zero-width space) on both sides of the citation body.
+        "ZWSP-decorated [\u200bcitation:9\u200b].",
+        1,
+        ChunkCitation(chunk_id=9, is_docs_chunk=False),
+    ),
+    (
+        "Whitespace [citation:  doc-100 ] tolerated.",
+        1,
+        ChunkCitation(chunk_id=100, is_docs_chunk=True),
+    ),
+    (
+        # The TS regex's URL char class excludes ']', so a trailing
+        # bracket isn't swallowed.
+        "Two URLs [citation:https://a.io] and [citation:https://b.io].",
+        2,
+        UrlCitation(url="https://a.io"),
+    ),
+    (
+        # Garbled form should match nothing.
+        "Citation-like but wrong [citation:].",
+        0,
+        None,
+    ),
+]
+
+
+@pytest.mark.parametrize("text,n_expected,first", PARITY_TABLE)
+def test_citation_regex_parity(text: str, n_expected: int, first):
+    """Check token count, and the first token when one is expected, for each row."""
+    parsed = parse_citations(text)
+    assert len(parsed) == n_expected, (text, parsed)
+    if first is None:
+        # Row does not pin the first token.
+        return
+    assert parsed[0] == first, (text, parsed)
+
+
+def test_regex_pattern_matches_ts_source():
+    """Sanity: the compiled pattern carries the exact alternatives the TS source does."""
+
+    source = CITATION_REGEX.pattern
+    required_fragments = ("https?://", "urlcite", "doc-", "\u200B", "【", "】")
+    for fragment in required_fragments:
+        assert fragment in source
+
+
+def test_url_map_resolution():
+    """A ``urlcite`` placeholder is replaced by the URL registered for it in ``url_map``."""
+    resolved_url = "https://resolved.example/x"
+    parsed = parse_citations(
+        "Inline placeholder [citation:urlcite0].",
+        url_map={"urlcite0": resolved_url},
+    )
+    assert parsed == [UrlCitation(url=resolved_url)]
+
+
+def test_url_map_missing_key_drops_token():
+    """Missing urlcite resolution returns no token (TS behaviour)."""
+
+    parsed = parse_citations("[citation:urlcite99]", url_map={})
+    assert parsed == []
--- a/surfsense_evals/tests/core/test_parse_freeform_answer.py
+++ b/surfsense_evals/tests/core/test_parse_freeform_answer.py
@ -0,0 +1,73 @@
+"""Tests for ``surfsense_evals.core.parse.freeform_answer``."""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.parse.freeform_answer import extract_freeform_answer
+
+
+class TestExtractFreeformAnswer:
+    """Behaviour of ``extract_freeform_answer`` across marker styles and fallbacks."""
+
+    def test_empty_string_returns_empty(self) -> None:
+        """Empty and whitespace-only input both give an empty answer."""
+        for blank in ("", "   \n\n  "):
+            assert extract_freeform_answer(blank) == ""
+
+    def test_simple_answer_marker(self) -> None:
+        """``Answer:`` prefix is stripped."""
+        extracted = extract_freeform_answer("Answer: 42")
+        assert extracted == "42"
+
+    def test_final_answer_marker(self) -> None:
+        """``Final answer:`` prefix is stripped."""
+        extracted = extract_freeform_answer("Final answer: Paris")
+        assert extracted == "Paris"
+
+    def test_the_answer_is_marker(self) -> None:
+        """``The answer is:`` prefix is stripped."""
+        extracted = extract_freeform_answer("The answer is: not answerable")
+        assert extracted == "not answerable"
+
+    def test_multiline_picks_last_answer_marker(self) -> None:
+        """With several marker lines, the last one wins."""
+        reply = "Let me think...\nAnswer: 5\nAnswer: 7\n"
+        assert extract_freeform_answer(reply) == "7"
+
+    def test_falls_back_to_last_nonempty_line(self) -> None:
+        """Without a marker, the last non-empty line is returned."""
+        reply = "Some thinking here.\n\n42"
+        assert extract_freeform_answer(reply) == "42"
+
+    def test_strips_quotes(self) -> None:
+        """Double and single quotes around the payload are removed."""
+        for quoted in ('Answer: "Paris"', "Answer: 'Paris'"):
+            assert extract_freeform_answer(quoted) == "Paris"
+
+    def test_strips_backticks(self) -> None:
+        """Backticks around the payload are removed."""
+        extracted = extract_freeform_answer("Answer: `42`")
+        assert extracted == "42"
+
+    def test_uses_fenced_block_when_no_marker(self) -> None:
+        """Without a marker, the content of a fenced block is returned."""
+        reply = "Here's my response:\n```\nfinal value\n```\n"
+        assert extract_freeform_answer(reply) == "final value"
+
+    def test_case_insensitive_markers(self) -> None:
+        """Upper- and lower-case marker spellings are both recognised."""
+        for reply, wanted in (("ANSWER: yes", "yes"), ("answer: no", "no")):
+            assert extract_freeform_answer(reply) == wanted
+
+    @pytest.mark.parametrize("text,expected", [
+        ("Answer: 1, 2, 3", "1, 2, 3"),
+        ("Answer: 3.14", "3.14"),
+        ("Answer:    spaced   ", "spaced"),
+    ])
+    def test_various_payloads(self, text: str, expected: str) -> None:
+        """Comma lists, decimals and padded payloads are returned trimmed."""
+        extracted = extract_freeform_answer(text)
+        assert extracted == expected
+
+    def test_inline_answer_after_thinking_trace(self) -> None:
+        """An ``Answer:`` glued to the end of a thinking sentence is still found."""
+        # Agent replies sometimes glue their thinking onto the same
+        # line as the final "Answer: ..." marker (no newline before it).
+        # The line-anchored regex misses this; the inline fallback
+        # should still extract the right value.
+        reply = (
+            "Need the Charlotte Bronte book title/year and the rank "
+            "for a 128-foot NYC building.Answer: 128th"
+        )
+        assert extract_freeform_answer(reply) == "128th"
+
+    def test_inline_picks_last_inline_answer(self) -> None:
+        """With several inline markers on one line, the last one wins."""
+        reply = "Thought: maybe Answer: 5 is right? Actually Answer: 7."
+        assert extract_freeform_answer(reply) == "7."
+
+    def test_inline_does_not_override_proper_marker(self) -> None:
+        """A marker at the start of a line beats an inline one."""
+        # When a clean line-anchored "Answer: ..." exists, that wins.
+        reply = "Some preamble.Answer: 99\nAnswer: 42"
+        assert extract_freeform_answer(reply) == "42"
--- a/surfsense_evals/tests/core/test_parse_sse.py
+++ b/surfsense_evals/tests/core/test_parse_sse.py
@ -0,0 +1,84 @@
+"""Tests for the SSE consumer."""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.parse import iter_sse_events
+
+
+async def _alist(it):
+    """Drain the async iterable ``it`` and return its items as a list, in order."""
+    return [item async for item in it]
+
+
+async def _astream(lines):
+    """Async generator that yields every element of ``lines`` in order."""
+    for entry in lines:
+        yield entry
+
+
+@pytest.mark.asyncio
+async def test_basic_data_frame():
+    """Two blank-line-terminated ``data:`` frames come out as two events, in order."""
+    wire_lines = [
+        'data: {"type": "text-delta", "delta": "hi"}',
+        "",
+        'data: {"type": "finish"}',
+        "",
+    ]
+    events = await _alist(iter_sse_events(_astream(wire_lines)))
+    payloads = [event.data for event in events]
+    assert payloads == [
+        '{"type": "text-delta", "delta": "hi"}',
+        '{"type": "finish"}',
+    ]
+
+
+@pytest.mark.asyncio
+async def test_done_sentinel_passes_through():
+    """The ``[DONE]`` payload is emitted as an ordinary event."""
+    wire_lines = ["data: [DONE]", ""]
+    events = await _alist(iter_sse_events(_astream(wire_lines)))
+    payloads = [event.data for event in events]
+    assert payloads == ["[DONE]"]
+
+
+@pytest.mark.asyncio
+async def test_multiline_data_joins_with_newline():
+    """Consecutive ``data:`` lines in one frame are joined with a newline."""
+    wire_lines = ["data: line1", "data: line2", ""]
+    events = await _alist(iter_sse_events(_astream(wire_lines)))
+    first_event = events[0]
+    assert first_event.data == "line1\nline2"
+
+
+@pytest.mark.asyncio
+async def test_comments_and_other_fields_ignored():
+    """Comment, ``event:`` and ``id:`` lines do not contribute to the event data."""
+    wire_lines = [
+        ": heartbeat",
+        "event: foo",
+        "id: 123",
+        "data: payload",
+        "",
+    ]
+    events = await _alist(iter_sse_events(_astream(wire_lines)))
+    payloads = [event.data for event in events]
+    assert payloads == ["payload"]
+
+
+@pytest.mark.asyncio
+async def test_handles_missing_trailing_blank():
+    """Some servers omit the final blank line; the consumer should still emit."""
+
+    wire_lines = ["data: only-one"]
+    events = await _alist(iter_sse_events(_astream(wire_lines)))
+    payloads = [event.data for event in events]
+    assert payloads == ["only-one"]
--- a/surfsense_evals/tests/core/test_pdf_render.py
+++ b/surfsense_evals/tests/core/test_pdf_render.py
@ -0,0 +1,51 @@
+"""Smoke tests for PDF rendering.
+
+We don't pull a full PDF parser into the test deps; the assertions
+are bytes-level (``%PDF`` magic, deterministic CreationDate scrub).
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from surfsense_evals.core.pdf import render_pdf, render_text_files_to_pdf
+
+
+def test_render_pdf_writes_pdf_with_magic(tmp_path: Path):
+    """``render_pdf`` writes a file at ``output_path`` that starts with ``%PDF-``."""
+    target = tmp_path / "out.pdf"
+    body_sections = [("intro", "Hello world."), ("body", "Line one.\nLine two.")]
+    result = render_pdf(title="Test", sections=body_sections, output_path=target)
+    assert result.path == target
+    assert target.exists()
+    written = target.read_bytes()
+    assert written.startswith(b"%PDF-")
+
+
+def test_render_pdf_deterministic_dates(tmp_path: Path):
+    """Rendering the same content twice yields byte-identical files."""
+    sections = [("only", "deterministic body content")]
+    destinations = [tmp_path / "a.pdf", tmp_path / "b.pdf"]
+    for destination in destinations:
+        render_pdf(title="Det", sections=sections, output_path=destination)
+    # CreationDate / ModDate are scrubbed to a fixed value, so the two
+    # files should compare equal (modulo any other internal randomness
+    # — reportlab's basic outputs are deterministic given fixed inputs).
+    first_bytes, second_bytes = (path.read_bytes() for path in destinations)
+    assert first_bytes == second_bytes
+
+
+def test_render_text_files_uses_filename_as_section(tmp_path: Path):
+    """Two text files render into one PDF whose ``n_chars`` covers both inputs."""
+    source_dir = tmp_path / "src"
+    source_dir.mkdir()
+    note_text = "history of present illness"
+    labs_text = "Na 138, K 4.0"
+    note_path = source_dir / "admission_note.txt"
+    labs_path = source_dir / "labs.txt"
+    note_path.write_text(note_text, encoding="utf-8")
+    labs_path.write_text(labs_text, encoding="utf-8")
+    target = tmp_path / "case.pdf"
+    result = render_text_files_to_pdf(
+        title="Case 1",
+        files=[note_path, labs_path],
+        output_path=target,
+    )
+    assert target.exists()
+    # We don't decode the PDF; the n_chars estimate should reflect both inputs.
+    assert result.n_chars >= len(note_text) + len(labs_text)
--- a/surfsense_evals/tests/core/test_pdf_render_with_images.py
+++ b/surfsense_evals/tests/core/test_pdf_render_with_images.py
@ -0,0 +1,73 @@
+"""Tests for ``render_pdf_with_images`` — covers image embedding +
+deterministic byte output, mirroring ``test_pdf_render.py`` for the
+text-only path.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.core.pdf import PdfImage, render_pdf_with_images
+
+
+@pytest.fixture
+def tiny_png(tmp_path: Path) -> Path:
+    """Write a 4x4 grey RGB PNG with Pillow and return its path.
+
+    Pillow does the encoding instead of a hand-written header: the
+    original note records that hand-crafted PNGs tend to fail PIL's
+    strict decoder, and that Pillow is already a transitive dependency
+    of reportlab.
+    """
+
+    from PIL import Image as PILImage
+
+    target = tmp_path / "pixel.png"
+    image = PILImage.new("RGB", (4, 4), color=(128, 128, 128))
+    image.save(target, format="PNG")
+    return target
+
+
+class TestRenderPdfWithImages:
+    """Byte-level checks on ``render_pdf_with_images``."""
+
+    def test_renders_pdf_with_no_images(self, tmp_path: Path) -> None:
+        """A section whose image slot is ``None`` still renders a PDF."""
+        target = tmp_path / "out.pdf"
+        text_only = [("Heading", "Body text here.", None)]
+        result = render_pdf_with_images(title="Test", sections=text_only, output_path=target)
+        assert result.path == target
+        assert target.exists()
+        assert target.read_bytes().startswith(b"%PDF-")
+
+    def test_renders_pdf_with_one_image(self, tmp_path: Path, tiny_png: Path) -> None:
+        """A section carrying one image produces an output file above 200 bytes."""
+        target = tmp_path / "out.pdf"
+        picture = PdfImage(path=tiny_png, caption="A pixel")
+        render_pdf_with_images(
+            title="Test",
+            sections=[("Case", "Body text.", [picture])],
+            output_path=target,
+        )
+        assert target.exists()
+        size_in_bytes = target.stat().st_size
+        assert size_in_bytes > 200  # not empty
+
+    def test_deterministic_bytes(self, tmp_path: Path, tiny_png: Path) -> None:
+        """Two renders of the same sections are byte-for-byte equal."""
+        sections = [
+            ("Case", "Some text.", [PdfImage(path=tiny_png, caption="cap")]),
+            ("Options", "A) one\nB) two", None),
+        ]
+        destinations = [tmp_path / "a.pdf", tmp_path / "b.pdf"]
+        for destination in destinations:
+            render_pdf_with_images(title="Test", sections=sections, output_path=destination)
+        first_bytes, second_bytes = (path.read_bytes() for path in destinations)
+        assert first_bytes == second_bytes
+
+    def test_skips_invalid_image_silently(self, tmp_path: Path) -> None:
+        """A bad image path should not abort the whole PDF render."""
+
+        target = tmp_path / "out.pdf"
+        missing_picture = PdfImage(path=tmp_path / "nope.jpg", caption="x")
+        render_pdf_with_images(
+            title="Test",
+            sections=[("Case", "Text", [missing_picture])],
+            output_path=target,
+        )
+        assert target.exists()
+        assert target.read_bytes().startswith(b"%PDF-")
--- a/surfsense_evals/tests/core/test_provider_openrouter.py
+++ b/surfsense_evals/tests/core/test_provider_openrouter.py
@ -0,0 +1,121 @@
+"""respx-mocked tests for the OpenRouter PDF provider."""
+
+from __future__ import annotations
+
+import base64
+import json
+from pathlib import Path
+
+import httpx
+import pytest
+import respx
+
+from surfsense_evals.core.providers.openrouter_pdf import (
+    OpenRouterPdfProvider,
+    PdfEngine,
+)
+
+# Base URL shared by the respx mock router and every provider built in this module.
+_BASE = "https://openrouter.test"
+
+
+@pytest.fixture
+def tiny_pdf(tmp_path: Path) -> Path:
+    """Write a few placeholder bytes starting with ``%PDF-1.4`` and return the path."""
+    target = tmp_path / "case.pdf"
+    target.write_bytes(b"%PDF-1.4 minimal content")
+    return target
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_payload_shape_matches_openrouter_docs(respx_mock, tiny_pdf: Path):
+    """Capture the outgoing chat-completions request and check its shape.
+
+    Asserts the model, the file-parser plugin entry, the file + text
+    content parts of the last message, the auth / title headers, and
+    how the mocked usage block is mapped onto the response object.
+    """
+    # Filled in by ``_capture`` when the mocked route is hit.
+    captured = {}
+
+    def _capture(request):
+        # Record the decoded JSON body and headers, then answer with a canned 200.
+        captured["body"] = json.loads(request.content)
+        captured["headers"] = dict(request.headers)
+        return httpx.Response(
+            200,
+            json={
+                "choices": [{
+                    "message": {"content": "Answer: B"},
+                    "finish_reason": "stop",
+                }],
+                "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15, "cost": 0.0001},
+            },
+        )
+
+    respx_mock.post("/chat/completions").mock(side_effect=_capture)
+
+    provider = OpenRouterPdfProvider(
+        api_key="sk-or-test",
+        base_url=_BASE,
+        model="anthropic/claude-sonnet-4.5",
+        engine=PdfEngine.NATIVE,
+    )
+    response = await provider.complete(prompt="What is the diagnosis?", pdf_path=tiny_pdf)
+    # --- request body ---
+    body = captured["body"]
+    assert body["model"] == "anthropic/claude-sonnet-4.5"
+    assert body["plugins"] == [{"id": "file-parser", "pdf": {"engine": "native"}}]
+    user = body["messages"][-1]
+    assert user["role"] == "user"
+    # First content part: the PDF as a base64 data URL.
+    file_part = user["content"][0]
+    assert file_part["type"] == "file"
+    assert file_part["file"]["filename"] == tiny_pdf.name
+    assert file_part["file"]["file_data"].startswith("data:application/pdf;base64,")
+    # The base64 payload after the comma must decode back to the fixture bytes.
+    assert (
+        base64.b64decode(file_part["file"]["file_data"].split(",", 1)[1])
+        == tiny_pdf.read_bytes()  # noqa: ASYNC240 — test fixture, sync read is fine
+    )
+    # Second content part: the prompt text.
+    assert user["content"][1] == {"type": "text", "text": "What is the diagnosis?"}
+    # --- request headers ---
+    assert captured["headers"]["authorization"] == "Bearer sk-or-test"
+    assert captured["headers"].get("x-title") == "SurfSense-evals"
+
+    # --- parsed response ---
+    assert response.text == "Answer: B"
+    assert response.input_tokens == 10
+    assert response.output_tokens == 5
+    assert response.total_tokens == 15
+    # cost 0.0001 USD == 100 micros
+    assert response.cost_micros == 100
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_chat_array_content_concatenates(respx_mock, tiny_pdf: Path):
+    """Array-form message content is reduced to the concatenated text parts."""
+    content_parts = [
+        {"type": "text", "text": "Hello "},
+        {"type": "text", "text": "world"},
+        {"type": "image_url", "image_url": "ignored"},
+    ]
+    canned_payload = {
+        "choices": [{"message": {"content": content_parts}}],
+        "usage": {"prompt_tokens": 1, "completion_tokens": 1},
+    }
+    canned_response = httpx.Response(200, json=canned_payload)
+    respx_mock.post("/chat/completions").mock(return_value=canned_response)
+    provider = OpenRouterPdfProvider(api_key="sk-or-test", base_url=_BASE, model="x/y")
+    reply = await provider.complete(prompt="hi", pdf_path=tiny_pdf)
+    assert reply.text == "Hello world"
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_provider_raises_on_4xx(respx_mock, tiny_pdf: Path):
+    """A mocked 429 reply makes ``complete`` raise ``httpx.HTTPStatusError``."""
+    rate_limited = httpx.Response(429, json={"error": {"message": "rate limited"}})
+    respx_mock.post("/chat/completions").mock(return_value=rate_limited)
+    provider = OpenRouterPdfProvider(api_key="sk-or-test", base_url=_BASE, model="x/y")
+    with pytest.raises(httpx.HTTPStatusError):
+        await provider.complete(prompt="hi", pdf_path=tiny_pdf)
+
+
+def test_missing_api_key_raises():
+    """Constructing the provider with an empty API key raises ``ValueError``."""
+    empty_key = ""
+    with pytest.raises(ValueError):
+        OpenRouterPdfProvider(api_key=empty_key, base_url=_BASE, model="x/y")
--- a/surfsense_evals/tests/core/test_registry.py
+++ b/surfsense_evals/tests/core/test_registry.py
@ -0,0 +1,58 @@
+"""Registry + auto-discovery tests.
+
+* Auto-discovery skips packages starting with ``_`` (so test fixtures
+  don't leak into the production catalogue).
+* Manually importing a ``_demo`` benchmark fires its ``register(...)``
+  call and the CLI sees it.
+"""
+
+from __future__ import annotations
+
+import importlib
+
+from surfsense_evals.core import registry
+
+
+def _force_register_demo() -> None:
+    """Make sure the demo benchmark's ``register(...)`` call has taken effect.
+
+    The first ``import_module`` in an interpreter executes the module
+    body. Later calls only return the cached entry from ``sys.modules``,
+    so when an earlier test has already unregistered the benchmark the
+    module is reloaded to run its body again.
+    """
+
+    demo_module = importlib.import_module("surfsense_evals.suites._demo.hello")
+    already_registered = ("_demo", "hello") in registry.snapshot()
+    if not already_registered:
+        importlib.reload(demo_module)
+
+
+def test_auto_discovery_skips_underscore_prefixed_subpackages():
+    """No discovered dotted name has a ``_``-prefixed part; medxpertqa is present."""
+    from surfsense_evals.suites import discover_suites
+
+    found = discover_suites()
+    underscored_parts = [
+        part
+        for dotted in found
+        for part in dotted.split(".")
+        if part.startswith("_")
+    ]
+    assert not underscored_parts
+    # The medical suite's headline benchmark must always discover.
+    assert any(dotted.endswith(".medical.medxpertqa") for dotted in found)
+
+
+def test_demo_benchmark_registers_on_explicit_import():
+    """Importing the demo module registers ``_demo/hello`` as a non-headline benchmark."""
+    _force_register_demo()
+    try:
+        bench = registry.get("_demo", "hello")
+        assert bench is not None
+        assert bench.name == "hello"
+        assert bench.headline is False
+    finally:
+        # Cleanup so the test is idempotent under repeated runs. It lives in
+        # ``finally`` so a failing assertion above cannot leave the demo entry
+        # registered for later tests; the membership check keeps the cleanup
+        # from raising when registration itself did not happen.
+        if ("_demo", "hello") in registry.snapshot():
+            registry.unregister("_demo", "hello")
+
+
+def test_register_unregister_roundtrip():
+    """Registering then unregistering the demo benchmark restores the earlier snapshot."""
+    demo_key = ("_demo", "hello")
+    # Make sure no stale entry from a prior test in the session.
+    if demo_key in registry.snapshot():
+        registry.unregister(*demo_key)
+    baseline = dict(registry.snapshot())
+    _force_register_demo()
+    assert demo_key in registry.snapshot()
+    registry.unregister(*demo_key)
+    assert dict(registry.snapshot()) == baseline
--- a/surfsense_evals/tests/core/test_scenarios.py
+++ b/surfsense_evals/tests/core/test_scenarios.py
@ -0,0 +1,68 @@
+"""Tests for the shared scenario formatter used in head-to-head reports."""
+
+from __future__ import annotations
+
+from surfsense_evals.core.scenarios import format_scenario_md
+
+
+def test_head_to_head_renders_both_arms_same_slug():
+    """The rendered line names the scenario and the provider model slug."""
+    rendered = format_scenario_md(
+        {"scenario": "head-to-head", "provider_model": "anthropic/claude-sonnet-4.5"}
+    )
+    for fragment in ("head-to-head", "anthropic/claude-sonnet-4.5"):
+        assert fragment in rendered
+
+
+def test_head_to_head_includes_vision_slug_when_recorded():
+    """With ``vision_provider_model`` set, the line contains "ingest VLM" and the slug."""
+    slug = "anthropic/claude-sonnet-4.5"
+    rendered = format_scenario_md(
+        {
+            "scenario": "head-to-head",
+            "provider_model": slug,
+            "vision_provider_model": slug,
+        }
+    )
+    assert "ingest VLM" in rendered
+    assert "claude-sonnet-4.5" in rendered
+
+
+def test_symmetric_cheap_calls_out_native_arm_disadvantage():
+    """The symmetric-cheap line is bolded, names the model, and carries the disclaimer."""
+    rendered = format_scenario_md(
+        {
+            "scenario": "symmetric-cheap",
+            "provider_model": "openai/gpt-5.4-mini",
+            "vision_provider_model": "anthropic/claude-sonnet-4.5",
+        }
+    )
+    assert "**symmetric-cheap**" in rendered
+    assert "gpt-5.4-mini" in rendered
+    # The "structurally loses" disclaimer must be there so reviewers
+    # don't read this as a fair comparison.
+    lowered = rendered.lower()
+    assert "structurally loses" in lowered or "structurally_loses" in lowered
+
+
+def test_cost_arbitrage_distinguishes_native_and_surfsense_slugs():
+    """The cost-arbitrage line shows both model slugs plus the per-query cost note."""
+    cheap_slug = "openai/gpt-5.4-mini"
+    premium_slug = "anthropic/claude-sonnet-4.5"
+    rendered = format_scenario_md(
+        {
+            "scenario": "cost-arbitrage",
+            "provider_model": cheap_slug,
+            "native_arm_model": premium_slug,
+            "vision_provider_model": premium_slug,
+        }
+    )
+    assert "**cost-arbitrage**" in rendered
+    # Both slugs surface; reader can see the asymmetry at a glance.
+    assert premium_slug in rendered
+    assert cheap_slug in rendered
+    assert "fraction of the per-query cost" in rendered
+
+
+def test_legacy_artifact_without_scenario_renders_as_head_to_head():
+    """Old run_artifact.json files don't have ``scenario`` — must still render."""
+
+    legacy_extra = {"provider_model": "anthropic/claude-sonnet-4.5"}
+    rendered = format_scenario_md(legacy_extra)
+    assert "head-to-head" in rendered
+
+
+def test_none_extra_does_not_crash():
+    """Passing ``None`` instead of a mapping still renders a head-to-head line."""
+    rendered = format_scenario_md(None)
+    assert "head-to-head" in rendered
--- a/surfsense_evals/tests/core/test_vision_llm.py
+++ b/surfsense_evals/tests/core/test_vision_llm.py
@ -0,0 +1,121 @@
+"""Tests for vision LLM auto-pick + explicit-slug resolution."""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.clients.search_space import VisionLlmConfigEntry
+from surfsense_evals.core.vision_llm import (
+    RECOMMENDED_VISION_PRIORITY,
+    VisionConfigError,
+    resolve_vision_llm,
+)
+
+
+def _entry(*, id: int, model_name: str, provider: str = "OPENROUTER") -> VisionLlmConfigEntry:
+    """Build a non-auto-mode ``VisionLlmConfigEntry`` with an empty ``raw`` payload."""
+    display_name = f"OpenRouter • {model_name}"
+    return VisionLlmConfigEntry(
+        id=id,
+        name=display_name,
+        provider=provider,
+        model_name=model_name,
+        is_auto_mode=False,
+        raw={},
+    )
+
+
+# ---------------------------------------------------------------------------
+# Explicit slug resolution
+# ---------------------------------------------------------------------------
+
+
+def test_explicit_slug_resolves_to_matching_config_id():
+    """An explicit slug picks the entry whose ``model_name`` matches it."""
+    sonnet = _entry(id=-101, model_name="anthropic/claude-sonnet-4.5")
+    gpt5 = _entry(id=-102, model_name="openai/gpt-5")
+    picked = resolve_vision_llm([sonnet, gpt5], explicit_slug="openai/gpt-5")
+    assert picked.config_id == -102
+    assert picked.provider_model == "openai/gpt-5"
+    assert picked.selected_via == "explicit"
+
+
+def test_explicit_slug_with_no_match_raises_with_helpful_listing():
+    """An unknown slug raises; the message names the slug and an available model."""
+    available = [_entry(id=-101, model_name="anthropic/claude-sonnet-4.5")]
+    with pytest.raises(VisionConfigError) as raised:
+        resolve_vision_llm(available, explicit_slug="some/missing-slug")
+    message = str(raised.value)
+    assert "some/missing-slug" in message
+    # surfaced as a sample
+    assert "anthropic/claude-sonnet-4.5" in message
+
+
+def test_explicit_slug_skips_non_openrouter_entries():
+    """A YAML BYOK entry with a colliding model_name shouldn't accidentally match."""
+
+    byok_entry = _entry(id=42, model_name="openai/gpt-5", provider="OPENAI")
+    openrouter_entry = _entry(id=-101, model_name="openai/gpt-5")
+    picked = resolve_vision_llm([byok_entry, openrouter_entry], explicit_slug="openai/gpt-5")
+    # the OpenRouter one, not the BYOK one
+    assert picked.config_id == -101
+
+
+# ---------------------------------------------------------------------------
+# Auto-pick by recommended priority
+# ---------------------------------------------------------------------------
+
+
+def test_auto_pick_walks_priority_list_in_order():
+    """Auto-pick follows the priority list, not the order of the candidates."""
+    # Candidates are deliberately listed with the expected winner last.
+    pool = [
+        _entry(id=-300, model_name="google/gemini-2.5-pro"),
+        _entry(id=-200, model_name="anthropic/claude-opus-4.7"),
+        _entry(id=-100, model_name="anthropic/claude-sonnet-4.5"),
+    ]
+    picked = resolve_vision_llm(pool, explicit_slug=None)
+    # claude-sonnet-4.5 is first in the priority tuple, so it wins.
+    assert picked.config_id == -100
+    assert picked.provider_model == "anthropic/claude-sonnet-4.5"
+    assert picked.selected_via == "auto-priority"
+
+
+def test_auto_pick_skips_to_next_priority_when_first_unavailable():
+    """When the top-priority model is absent, the next priority entry is chosen."""
+    pool = [
+        _entry(id=-200, model_name="anthropic/claude-opus-4.7"),
+        _entry(id=-300, model_name="google/gemini-2.5-pro"),
+    ]
+    picked = resolve_vision_llm(pool, explicit_slug=None)
+    # claude-sonnet-4.5 not registered → claude-opus-4.7 is next in priority.
+    assert picked.provider_model == "anthropic/claude-opus-4.7"
+    assert picked.selected_via == "auto-priority"
+
+
+def test_auto_pick_falls_back_to_first_openrouter_when_no_recommended_match():
+    """With no priority match, the first candidate is chosen via "auto-fallback"."""
+    pool = [
+        _entry(id=-700, model_name="some/exotic-vision-model"),
+        _entry(id=-800, model_name="another/exotic-vision-model"),
+    ]
+    picked = resolve_vision_llm(pool, explicit_slug=None)
+    # Neither matches the priority list → first OpenRouter entry wins.
+    assert picked.config_id == -700
+    assert picked.selected_via == "auto-fallback"
+
+
+def test_auto_pick_with_zero_openrouter_candidates_raises():
+    """An empty candidate list raises; the message mentions ``vision_enabled: true``."""
+    nothing_available: list[VisionLlmConfigEntry] = []
+    with pytest.raises(VisionConfigError) as raised:
+        resolve_vision_llm(nothing_available, explicit_slug=None)
+    message = str(raised.value)
+    assert "vision_enabled: true" in message
+
+
+def test_auto_pick_ignores_non_openrouter_entries():
+    """A lone candidate from another provider is not eligible, so auto-pick raises."""
+    anthropic_only = [
+        _entry(id=99, model_name="anthropic/claude-sonnet-4.5", provider="ANTHROPIC"),
+    ]
+    with pytest.raises(VisionConfigError):
+        resolve_vision_llm(anthropic_only, explicit_slug=None)
+
+
+def test_recommended_priority_is_a_stable_public_list():
+    """If you reorder this, update the README's auto-pick claim too."""
+
+    top_choice = RECOMMENDED_VISION_PRIORITY[0]
+    assert top_choice == "anthropic/claude-sonnet-4.5"
+    assert "google/gemini-2.5-pro" in RECOMMENDED_VISION_PRIORITY
--- a/surfsense_evals/tests/suites/init.py
+++ b/surfsense_evals/tests/suites/init.py
@ -0,0 +1 @@
+
--- a/surfsense_evals/tests/suites/test_crag_dataset.py
+++ b/surfsense_evals/tests/suites/test_crag_dataset.py
@ -0,0 +1,224 @@
+"""Tests for the CRAG dataset loader (parser + sampling).
+
+The full bz2 download is excluded — these tests synthesise a tiny
+JSONL-bz2 in a tmp dir and verify the parser / stratified-sampler
+produce well-shaped objects.
+"""
+
+from __future__ import annotations
+
+import bz2
+import json
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.suites.research.crag.dataset import (
+    CragPage,
+    CragQuestion,
+    iter_questions,
+    stratified_sample,
+)
+
+
+def _make_jsonl_bz2(rows: list[dict], tmp_path: Path) -> Path:
+    """Store ``rows`` as newline-separated JSON inside a bz2 file.
+
+    The file is created at ``tmp_path / "fake.jsonl.bz2"``; rows are
+    joined with ``\\n`` (no trailing newline) and UTF-8 encoded before
+    compression. Returns the path of the written file.
+    """
+
+    target = tmp_path / "fake.jsonl.bz2"
+    encoded_lines = [json.dumps(row).encode("utf-8") for row in rows]
+    target.write_bytes(bz2.compress(b"\n".join(encoded_lines)))
+    return target
+
+
+def _row(
+    *,
+    interaction_id: str,
+    query: str,
+    answer: str,
+    domain: str = "movie",
+    question_type: str = "simple",
+    pages: list[dict] | None = None,
+    alt_ans: list[str] | None = None,
+    popularity: str = "head",
+    static_or_dynamic: str = "static",
+    split: int = 0,
+    query_time: str = "2024-04-01",
+) -> dict:
+    """Assemble one raw record dict for the synthetic JSONL fixtures.
+
+    ``pages`` is stored under ``"search_results"`` and ``alt_ans`` under
+    ``"alt_ans"``; a missing or empty value for either becomes ``[]``.
+    """
+
+    alternates = alt_ans if alt_ans else []
+    results = pages if pages else []
+    return dict(
+        interaction_id=interaction_id,
+        query_time=query_time,
+        domain=domain,
+        question_type=question_type,
+        static_or_dynamic=static_or_dynamic,
+        query=query,
+        answer=answer,
+        alt_ans=alternates,
+        split=split,
+        popularity=popularity,
+        search_results=results,
+    )
+
+
+class TestParser:
+    """Parsing behaviour of ``iter_questions`` on tiny synthetic bz2 files."""
+
+    def test_basic_parse(self, tmp_path: Path) -> None:
+        """One well-formed row yields one question carrying its single page."""
+        wiki_page = {
+            "page_name": "Inception (film)",
+            "page_url": "https://en.wikipedia.org/wiki/Inception",
+            "page_snippet": "snippet",
+            "page_result": "<html>full html</html>",
+            "page_last_modified": "2024-01-01",
+        }
+        record = _row(
+            interaction_id="abc",
+            query="Who directed Inception?",
+            answer="Christopher Nolan",
+            pages=[wiki_page],
+        )
+        questions = iter_questions(_make_jsonl_bz2([record], tmp_path))
+        assert len(questions) == 1
+
+        question = questions[0]
+        assert question.query == "Who directed Inception?"
+        assert question.gold_answer == "Christopher Nolan"
+        assert question.qid == "C00000"
+        assert question.domain == "movie"
+        assert question.question_type == "simple"
+        assert len(question.pages) == 1
+
+        first_page = question.pages[0]
+        assert first_page.page_name == "Inception (film)"
+        assert first_page.page_url == "https://en.wikipedia.org/wiki/Inception"
+
+    def test_skips_missing_query_or_answer(self, tmp_path: Path) -> None:
+        """Rows with an empty query or an empty answer are dropped."""
+        specs = (
+            ("1", "", "x"),
+            ("2", "ok?", ""),
+            ("3", "ok?", "x"),
+        )
+        records = [
+            _row(interaction_id=iid, query=query, answer=answer)
+            for iid, query, answer in specs
+        ]
+        questions = iter_questions(_make_jsonl_bz2(records, tmp_path))
+        assert len(questions) == 1
+        assert questions[0].interaction_id == "3"
+
+    def test_skips_empty_pages(self, tmp_path: Path) -> None:
+        """Pages lacking a URL or HTML body are left out of ``pages``."""
+        candidate_pages = [
+            {"page_url": "", "page_result": "<html/>"},  # no URL
+            {"page_url": "https://x.test/", "page_result": ""},  # empty html
+            {"page_url": "https://y.test/", "page_result": "<html>good</html>"},
+        ]
+        record = _row(
+            interaction_id="x",
+            query="q?",
+            answer="a",
+            pages=candidate_pages,
+        )
+        questions = iter_questions(_make_jsonl_bz2([record], tmp_path))
+        assert len(questions) == 1
+
+        kept = questions[0].pages
+        assert len(kept) == 1
+        assert kept[0].page_url == "https://y.test/"
+
+    def test_alt_answers_parsed(self, tmp_path: Path) -> None:
+        """``alt_ans`` from the raw row surfaces as ``alt_answers``."""
+        record = _row(
+            interaction_id="z",
+            query="q?",
+            answer="42",
+            alt_ans=["forty-two", "42.0"],
+        )
+        questions = iter_questions(_make_jsonl_bz2([record], tmp_path))
+        assert questions[0].alt_answers == ["forty-two", "42.0"]
+
+    def test_handles_malformed_line(self, tmp_path: Path) -> None:
+        """A garbage line ahead of a valid row is skipped, not fatal."""
+        valid_line = json.dumps(_row(interaction_id="ok", query="q?", answer="a"))
+        archive = tmp_path / "mixed.jsonl.bz2"
+        # Line 0 is not JSON; line 1 is the only parseable row.
+        content = b"not-json{\n" + (valid_line + "\n").encode("utf-8")
+        archive.write_bytes(bz2.compress(content))
+
+        questions = iter_questions(archive)
+        assert len(questions) == 1
+        assert questions[0].interaction_id == "ok"
+
+
+class TestPageHash:
+    """Expectations on ``CragPage.url_hash``."""
+
+    @staticmethod
+    def _page(name: str, url: str) -> CragPage:
+        """Build a page that differs from its siblings only by name and URL."""
+        return CragPage(
+            page_name=name,
+            page_url=url,
+            page_snippet="",
+            page_html="<html/>",
+        )
+
+    def test_url_hash_stable(self) -> None:
+        """Same URL with different names gives the same 12-character hash."""
+        shared_url = "https://x.test/p?q=1"
+        first = self._page("A", shared_url)
+        second = self._page("B", shared_url)
+        assert first.url_hash == second.url_hash
+        assert len(first.url_hash) == 12
+
+    def test_url_hash_unique(self) -> None:
+        """Different URLs give different hashes."""
+        first = self._page("A", "https://x.test/a")
+        second = self._page("B", "https://x.test/b")
+        assert first.url_hash != second.url_hash
+
+
+class TestStratifiedSample:
+    """Behaviour of ``stratified_sample`` over a three-bucket pool."""
+
+    def _make_pool(self) -> list[CragQuestion]:
+        """Return 55 questions: 30 finance/simple, 20 movie/comparison, 5 sports/multi-hop.
+
+        ``raw_index`` and the ``qid`` / ``interaction_id`` / ``query``
+        suffixes run sequentially from 0 across all three buckets.
+        """
+        buckets = (
+            (30, "finance", "simple"),
+            (20, "movie", "comparison"),
+            (5, "sports", "multi-hop"),
+        )
+        labels = [
+            (domain, qtype)
+            for count, domain, qtype in buckets
+            for _ in range(count)
+        ]
+        return [
+            CragQuestion(
+                qid=f"C{position:05d}",
+                interaction_id=f"i{position}",
+                query_time="2024-01-01",
+                query=f"q{position}?",
+                gold_answer="a",
+                alt_answers=[],
+                domain=domain,
+                question_type=qtype,
+                static_or_dynamic="static",
+                popularity="head",
+                split=0,
+                raw_index=position,
+                pages=[],
+            )
+            for position, (domain, qtype) in enumerate(labels)
+        ]
+
+    def test_sample_smaller_than_pool(self) -> None:
+        """A 15-item sample has exactly 15 rows and touches every domain."""
+        picked = stratified_sample(self._make_pool(), n=15, seed=7)
+        assert len(picked) == 15
+        # Every bucket should contribute at least one question.
+        seen_domains = {question.domain for question in picked}
+        assert seen_domains == {"finance", "movie", "sports"}
+
+    def test_sample_returns_pool_when_n_geq(self) -> None:
+        """Asking for more than the pool holds returns a pool-sized result."""
+        pool = self._make_pool()
+        picked = stratified_sample(pool, n=999, seed=1)
+        assert len(picked) == len(pool)
+
+    def test_sample_sorted_by_raw_index(self) -> None:
+        """The sample comes back ordered by ascending ``raw_index``."""
+        picked = stratified_sample(self._make_pool(), n=10, seed=42)
+        order = [question.raw_index for question in picked]
+        assert order == sorted(order)
+
+    def test_sample_deterministic(self) -> None:
+        """Two runs with the same seed select the same qids in the same order."""
+        pool = self._make_pool()
+        first_run = [q.qid for q in stratified_sample(pool, n=20, seed=11)]
+        second_run = [q.qid for q in stratified_sample(pool, n=20, seed=11)]
+        assert first_run == second_run
+
+    def test_n_zero_or_negative_returns_pool(self) -> None:
+        """``n=0`` and ``n=-1`` both give a pool-sized result."""
+        pool = self._make_pool()
+        expected_size = len(pool)
+        assert len(stratified_sample(pool, n=0)) == expected_size
+        assert len(stratified_sample(pool, n=-1)) == expected_size
--- a/surfsense_evals/tests/suites/test_crag_dataset_task3.py
+++ b/surfsense_evals/tests/suites/test_crag_dataset_task3.py
@ -0,0 +1,259 @@
+"""Unit tests for CRAG Task 3 streaming dataset loader.
+
+We don't (and shouldn't) hit the real 7 GB upstream archive in
+unit tests. Instead we construct tiny tar.bz2 archives split across
+N parts and verify:
+
+* ``_MultiPartReader`` correctly stitches N files together.
+* The streaming path (multi → bz2 → tar → JSONL) yields parsed
+  ``CragQuestion`` rows with the right shape.
+* ``max_questions`` cap is honoured (early break, no greedy read).
+* ``parts_present`` correctly detects missing/empty parts.
+"""
+
+from __future__ import annotations
+
+import bz2
+import io
+import json
+import tarfile
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.suites.research.crag.dataset_task3 import (
+    _MultiPartReader,
+    iter_questions_task3,
+    parts_present,
+)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures: build a tiny synthetic Task 3 archive
+# ---------------------------------------------------------------------------
+
+
+def _make_jsonl_payload(n_rows: int) -> bytes:
+    """Return ``n_rows`` synthetic records as newline-terminated JSONL bytes.
+
+    Row ``i`` cycles through five domains and four question types by
+    index and carries 50 ``search_results`` pages. The output always
+    ends with a newline, even when ``n_rows`` is 0.
+    """
+    domains = ["finance", "music", "movie", "sports", "open"]
+    question_types = ["simple", "comparison", "aggregation", "multi-hop"]
+
+    def _pages(i: int) -> list:
+        return [
+            {
+                "page_name": f"Page {j} for q{i}",
+                "page_url": f"https://example.com/q{i}/p{j}",
+                "page_snippet": "snippet",
+                "page_result": f"<html><body><p>q{i} p{j} body</p></body></html>",
+                "page_last_modified": "",
+            }
+            for j in range(50)
+        ]
+
+    def _record(i: int) -> dict:
+        return {
+            "interaction_id": f"int_{i:04d}",
+            "query_time": "2024-01-01 00:00:00",
+            "domain": domains[i % 5],
+            "question_type": question_types[i % 4],
+            "static_or_dynamic": "static",
+            "popularity": "head",
+            "split": 0,
+            "query": f"Synthetic CRAG question {i}?",
+            "answer": f"answer-{i}",
+            "alt_ans": [f"alt-{i}-a", f"alt-{i}-b"],
+            "search_results": _pages(i),
+        }
+
+    encoded = [json.dumps(_record(i)).encode("utf-8") for i in range(n_rows)]
+    return b"\n".join(encoded) + b"\n"
+
+
+def _make_tar_bz2(jsonl_bytes: bytes, *, member_name: str = "data.jsonl") -> bytes:
+    """Return a bz2-compressed tar archive with ``jsonl_bytes`` as its only member.
+
+    The member is named ``member_name``. The tar is assembled in memory
+    first and then compressed in a single step.
+    """
+    member = tarfile.TarInfo(name=member_name)
+    member.size = len(jsonl_bytes)
+
+    raw_tar = io.BytesIO()
+    with tarfile.open(fileobj=raw_tar, mode="w") as archive:
+        archive.addfile(member, io.BytesIO(jsonl_bytes))
+    return bz2.compress(raw_tar.getvalue())
+
+
+def _make_tar_bz2_multi(shards: list[tuple[str, bytes]]) -> bytes:
+    """Return a bz2-compressed tar with one member per ``(name, payload)`` shard.
+
+    This mirrors the real CRAG Task 3 layout, where a single tar holds
+    N JSONL members named like ``crag_task_3_dev_v4_{i}.jsonl`` (the
+    caller chooses the names). Members are added in the order given.
+    """
+
+    raw_tar = io.BytesIO()
+    with tarfile.open(fileobj=raw_tar, mode="w") as archive:
+        for member_name, content in shards:
+            member = tarfile.TarInfo(name=member_name)
+            member.size = len(content)
+            archive.addfile(member, io.BytesIO(content))
+    return bz2.compress(raw_tar.getvalue())
+
+
+def _split_into_parts(blob: bytes, n_parts: int) -> list[bytes]:
+    """Cut ``blob`` into ``n_parts`` consecutive pieces.
+
+    The first ``n_parts - 1`` pieces share one size (at least one byte);
+    the final piece takes whatever is left, which may be empty when the
+    blob is shorter than ``n_parts``.
+    """
+    step = max(1, len(blob) // n_parts)
+    last_start = (n_parts - 1) * step
+    leading = [blob[lo : lo + step] for lo in range(0, last_start, step)]
+    return leading + [blob[last_start:]]
+
+
+@pytest.fixture
+def task3_parts_dir(tmp_path: Path) -> Path:
+    """Create ``.raw_cache`` holding a 12-row synthetic archive split into 4 part files."""
+    cache_dir = tmp_path / ".raw_cache"
+    cache_dir.mkdir()
+
+    archive = _make_tar_bz2(_make_jsonl_payload(12))
+    # Part files are numbered from 1: ...tar.bz2.part1 through part4.
+    for number, piece in enumerate(_split_into_parts(archive, 4), start=1):
+        part_path = cache_dir / f"crag_task_3_dev_v4.tar.bz2.part{number}"
+        part_path.write_bytes(piece)
+    return cache_dir
+
+
+# ---------------------------------------------------------------------------
+# _MultiPartReader
+# ---------------------------------------------------------------------------
+
+
+class TestMultiPartReader:
+    """Read, close and constructor-error behaviour of ``_MultiPartReader``."""
+
+    def test_concatenates_parts_in_order(self, tmp_path: Path) -> None:
+        """An unbounded ``read()`` returns every part joined in list order."""
+        contents = (("a", b"hello, "), ("b", b"streaming "), ("c", b"world!"))
+        part_paths = []
+        for filename, data in contents:
+            part = tmp_path / filename
+            part.write_bytes(data)
+            part_paths.append(part)
+
+        with _MultiPartReader(part_paths) as reader:
+            assert reader.read() == b"hello, streaming world!"
+
+    def test_read_n_crosses_part_boundary(self, tmp_path: Path) -> None:
+        """A sized read may span two parts; reads at EOF return ``b""``."""
+        first, second = tmp_path / "a", tmp_path / "b"
+        first.write_bytes(b"AAA")
+        second.write_bytes(b"BBBB")
+
+        with _MultiPartReader([first, second]) as reader:
+            # Five bytes: all of the first part plus two from the second.
+            assert reader.read(5) == b"AAABB"
+            assert reader.read(5) == b"BB"
+            assert reader.read(5) == b""
+
+    def test_close_is_idempotent(self, tmp_path: Path) -> None:
+        """Closing twice is harmless; reading afterwards raises ``ValueError``."""
+        only_part = tmp_path / "a"
+        only_part.write_bytes(b"x")
+
+        reader = _MultiPartReader([only_part])
+        for _ in range(2):
+            reader.close()
+
+        with pytest.raises(ValueError):
+            reader.read(1)
+
+    def test_missing_part_raises(self, tmp_path: Path) -> None:
+        """Constructing with a non-existent part raises ``FileNotFoundError``."""
+        absent = tmp_path / "does-not-exist"
+        with pytest.raises(FileNotFoundError):
+            _MultiPartReader([absent])
+
+    def test_empty_paths_raises(self) -> None:
+        """Constructing with no parts raises ``ValueError``."""
+        with pytest.raises(ValueError):
+            _MultiPartReader([])
+
+
+# ---------------------------------------------------------------------------
+# iter_questions_task3
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def task3_multi_shard_dir(tmp_path: Path) -> Path:
+    """Create ``.raw_cache`` with a 4-part archive whose tar holds 3 JSONL shards of 4 rows."""
+    shards = [
+        (f"crag_task_3_dev_v4_{shard_no}.jsonl", _make_jsonl_payload(4))
+        for shard_no in range(3)
+    ]
+    archive = _make_tar_bz2_multi(shards)
+
+    cache_dir = tmp_path / ".raw_cache"
+    cache_dir.mkdir()
+    # Part files are numbered from 1: ...tar.bz2.part1 through part4.
+    for number, piece in enumerate(_split_into_parts(archive, 4), start=1):
+        part_path = cache_dir / f"crag_task_3_dev_v4.tar.bz2.part{number}"
+        part_path.write_bytes(piece)
+    return cache_dir
+
+
+class TestIterQuestionsTask3:
+    """End-to-end checks of ``iter_questions_task3`` on synthetic part files."""
+
+    def test_streams_full_archive(self, task3_parts_dir: Path) -> None:
+        """All 12 rows of the single-member archive are parsed with their fields."""
+        questions = iter_questions_task3(task3_parts_dir)
+        assert len(questions) == 12
+        # All questions get the T3_ prefix and 50 pages each.
+        assert all(q.qid.startswith("T3_") for q in questions)
+        assert all(len(q.pages) == 50 for q in questions)
+        # Schema fields preserved.
+        first = questions[0]
+        assert first.query == "Synthetic CRAG question 0?"
+        assert first.gold_answer == "answer-0"
+        assert first.domain == "finance"
+        assert "alt-0-a" in first.alt_answers
+
+    def test_max_questions_caps_early(self, task3_parts_dir: Path) -> None:
+        """``max_questions=3`` returns the first three rows only."""
+        questions = iter_questions_task3(task3_parts_dir, max_questions=3)
+        assert len(questions) == 3
+        # Sequential indices 0..2 — we don't skip rows.
+        assert [q.raw_index for q in questions] == [0, 1, 2]
+
+    def test_streams_multi_shard_archive(self, task3_multi_shard_dir: Path) -> None:
+        """Rows from all three shards are returned with continuous indices."""
+        # Three shards × four rows each = twelve rows total.
+        questions = iter_questions_task3(task3_multi_shard_dir)
+        assert len(questions) == 12
+        # raw_index increments monotonically across shards.
+        assert [q.raw_index for q in questions] == list(range(12))
+        # qids are unique across shards.
+        assert len({q.qid for q in questions}) == 12
+
+    def test_max_questions_short_circuits_first_shard(self, task3_multi_shard_dir: Path) -> None:
+        """A cap below the shard size is satisfied from the first shard."""
+        # Cap < shard size — shouldn't touch shards 1 or 2 at all.
+        questions = iter_questions_task3(task3_multi_shard_dir, max_questions=2)
+        assert len(questions) == 2
+        # Both come from shard 0 (raw_index 0, 1).
+        assert [q.raw_index for q in questions] == [0, 1]
+
+    def test_max_questions_spans_shards(self, task3_multi_shard_dir: Path) -> None:
+        """A cap larger than one shard continues into the next shard."""
+        # Cap = 6 → all 4 from shard 0 + first 2 from shard 1.
+        questions = iter_questions_task3(task3_multi_shard_dir, max_questions=6)
+        assert len(questions) == 6
+        assert [q.raw_index for q in questions] == [0, 1, 2, 3, 4, 5]
+
+    def test_raises_when_no_jsonl_member(self, tmp_path: Path) -> None:
+        """An archive whose only member is not JSONL raises ``RuntimeError``."""
+        # Reuse the module helpers instead of hand-building and hand-slicing
+        # the archive: same tar layout, same 4-way split, same file names.
+        archive = _make_tar_bz2(b"not jsonl", member_name="README.md")
+        parts_dir = tmp_path / ".raw_cache"
+        parts_dir.mkdir()
+        for number, chunk in enumerate(_split_into_parts(archive, 4), start=1):
+            part_path = parts_dir / f"crag_task_3_dev_v4.tar.bz2.part{number}"
+            part_path.write_bytes(chunk)
+        with pytest.raises(RuntimeError, match="No JSONL member"):
+            iter_questions_task3(parts_dir)
+
+
+# ---------------------------------------------------------------------------
+# parts_present
+# ---------------------------------------------------------------------------
+
+
+class TestPartsPresent:
+    """``parts_present`` is true only when no part file is missing or empty."""
+
+    def test_all_present(self, task3_parts_dir: Path) -> None:
+        """The untouched fixture directory counts as complete."""
+        complete = parts_present(task3_parts_dir)
+        assert complete is True
+
+    def test_one_missing(self, task3_parts_dir: Path) -> None:
+        """Deleting part 2 makes the directory incomplete."""
+        second_part = task3_parts_dir / "crag_task_3_dev_v4.tar.bz2.part2"
+        second_part.unlink()
+        assert parts_present(task3_parts_dir) is False
+
+    def test_one_empty(self, task3_parts_dir: Path) -> None:
+        """Truncating part 3 to zero bytes makes the directory incomplete."""
+        third_part = task3_parts_dir / "crag_task_3_dev_v4.tar.bz2.part3"
+        third_part.write_bytes(b"")
+        assert parts_present(task3_parts_dir) is False
--- a/surfsense_evals/tests/suites/test_crag_grader.py
+++ b/surfsense_evals/tests/suites/test_crag_grader.py
@ -0,0 +1,248 @@
+"""Tests for the CRAG 3-class deterministic grader.
+
+The LLM-judge fallback is excluded here (network call); these tests
+exercise the deterministic shortcut + the special-case routing for
+``false_premise`` questions and refusal detection (``I don't know``).
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.suites.research.crag.grader import (
+    CragGradeResult,
+    _flags_false_premise,
+    _is_refusal,
+    _maybe_number,
+    _normalise,
+    _whole_word_substring,
+    grade_deterministic,
+)
+
+
+class TestNormalisation:
+    """Expected ``_normalise`` outputs for representative inputs."""
+
+    def test_lowercase_and_punct_stripped(self) -> None:
+        """Case is folded and the trailing period is dropped."""
+        cleaned = _normalise("Apple Inc.")
+        assert cleaned == "apple inc"
+
+    def test_articles_removed(self) -> None:
+        """A leading "The" does not survive normalisation."""
+        cleaned = _normalise("The Apple Watch")
+        assert cleaned == "apple watch"
+
+    def test_empty_returns_empty(self) -> None:
+        """The empty string maps to itself."""
+        cleaned = _normalise("")
+        assert cleaned == ""
+
+
+class TestNumericExtraction:
+    """Expected ``_maybe_number`` results for digits, currency, prose and words."""
+
+    def test_simple_int(self) -> None:
+        """A bare integer string parses to its float value."""
+        value = _maybe_number("42")
+        assert value == 42.0
+
+    def test_int_with_commas(self) -> None:
+        """A currency sign and thousands separator are tolerated."""
+        value = _maybe_number("$1,234")
+        assert value == 1234.0
+
+    def test_year_in_sentence(self) -> None:
+        """A number embedded in surrounding words is still extracted."""
+        value = _maybe_number("released in 2008")
+        assert value == 2008.0
+
+    def test_word_number(self) -> None:
+        """A spelled-out number is understood."""
+        value = _maybe_number("seven")
+        assert value == 7.0
+
+
+class TestWholeWordSubstring:
+    """``_whole_word_substring`` matches phrases only at word boundaries."""
+
+    def test_phrase_match(self) -> None:
+        """A multi-word needle inside a longer phrase is found."""
+        found = _whole_word_substring("the new york yankees", "new york")
+        assert found
+
+    def test_word_boundary_required(self) -> None:
+        """A needle that is only a prefix of a longer word is rejected."""
+        found = _whole_word_substring("yorkshire", "york")
+        assert not found
+
+
+class TestRefusalDetection:
+    """Phrases that ``_is_refusal`` should and should not treat as a refusal."""
+
+    def test_explicit_idk(self) -> None:
+        """An "I don't know" after an ``Answer:`` prefix is a refusal."""
+        verdict = _is_refusal("Answer: I don't know")
+        assert verdict
+
+    def test_idk_no_apostrophe(self) -> None:
+        """The apostrophe-less spelling is recognised too."""
+        verdict = _is_refusal("I dont know")
+        assert verdict
+
+    def test_no_information(self) -> None:
+        """A "no information available" statement is a refusal."""
+        verdict = _is_refusal("There is no information available about this.")
+        assert verdict
+
+    def test_unable_to_answer(self) -> None:
+        """An "unable to answer" statement is a refusal."""
+        verdict = _is_refusal("I am unable to answer this question.")
+        assert verdict
+
+    def test_empty_is_refusal(self) -> None:
+        """Empty and whitespace-only predictions count as refusals."""
+        for blank in ("", "   "):
+            assert _is_refusal(blank)
+
+    def test_real_answer_is_not_refusal(self) -> None:
+        """Substantive answers are not mistaken for refusals."""
+        for committed in ("Answer: Apple Inc", "The CEO is Tim Cook."):
+            assert not _is_refusal(committed)
+
+
+class TestFalsePremiseDetection:
+    """Phrasings that ``_flags_false_premise`` should and should not flag."""
+
+    def test_explicit_false_premise(self) -> None:
+        """The literal phrase "false premise" is flagged."""
+        prediction = (
+            "The question contains a false premise; the company never had that product."
+        )
+        assert _flags_false_premise(prediction)
+
+    def test_no_such(self) -> None:
+        """A "no such ..." statement is flagged."""
+        flagged = _flags_false_premise("There is no such album.")
+        assert flagged
+
+    def test_did_not_happen(self) -> None:
+        """A "did not happen" statement is flagged."""
+        flagged = _flags_false_premise("That event did not happen.")
+        assert flagged
+
+    def test_does_not_exist(self) -> None:
+        """A "does not exist" statement is flagged."""
+        flagged = _flags_false_premise("That movie does not exist.")
+        assert flagged
+
+    def test_normal_answer_is_not_premise_flag(self) -> None:
+        """An ordinary factual answer is left unflagged."""
+        flagged = _flags_false_premise("Apple, headquartered in Cupertino.")
+        assert not flagged
+
+
+class TestGradeDeterministicHappyPath:
+    """Exact, substring, alt-answer and numeric outcomes of ``grade_deterministic``."""
+
+    def test_exact_match_correct(self) -> None:
+        """Identical prediction and gold grade as correct via ``exact``."""
+        outcome = grade_deterministic(
+            pred="Tim Cook",
+            gold="Tim Cook",
+            question_type="simple",
+        )
+        assert outcome.grade == "correct"
+        assert outcome.score == 1
+        assert outcome.method == "exact"
+
+    def test_substring_match(self) -> None:
+        """Gold embedded in a longer sentence grades correct via ``substring``."""
+        outcome = grade_deterministic(
+            pred="The answer is Tim Cook, CEO of Apple.",
+            gold="Tim Cook",
+            question_type="simple",
+        )
+        assert outcome.grade == "correct"
+        assert outcome.method == "substring"
+
+    def test_alt_answer_match(self) -> None:
+        """A prediction agreeing with an alternate answer is credited."""
+        alternates = ["2008"]
+        outcome = grade_deterministic(
+            pred="2,008",
+            gold="two thousand eight",
+            alt_answers=alternates,
+            question_type="simple",
+        )
+        assert outcome.grade == "correct"
+        assert outcome.score == 1
+
+    def test_numeric_within_tolerance(self) -> None:
+        """Numbers that differ only slightly grade correct via ``numeric``."""
+        outcome = grade_deterministic(
+            pred="The revenue was $1,234,000 USD",
+            gold="$1,234,123",
+            question_type="aggregation",
+        )
+        assert outcome.grade == "correct"
+        assert outcome.method == "numeric"
+
+    def test_numeric_outside_tolerance(self) -> None:
+        """Numbers far apart grade incorrect with a score of -1."""
+        outcome = grade_deterministic(
+            pred="100",
+            gold="500",
+            question_type="aggregation",
+        )
+        assert outcome.grade == "incorrect"
+        assert outcome.score == -1
+
+    def test_numeric_strict_small_currency(self) -> None:
+        """Small currency amounts get no absolute-difference leeway."""
+        # Unlike FRAMES, CRAG applies no 0.5 absolute floor, so ``$2.05``
+        # must NOT match ``$2.17`` (about 5.5% off, well beyond 1%).
+        outcome = grade_deterministic(
+            pred="$2.05",
+            gold="$2.17",
+            question_type="simple",
+        )
+        # With no substring overlap either, this ends up as lexical_miss.
+        assert outcome.grade == "incorrect"
+        assert outcome.method == "lexical_miss"
+
+
+class TestGradeDeterministicRefusal:
+    """Refusal-style predictions grade as ``missing``."""
+
+    def test_idk_maps_to_missing(self) -> None:
+        """"I don't know." scores 0 via the ``refusal`` method."""
+        outcome = grade_deterministic(
+            pred="I don't know.",
+            gold="Tim Cook",
+            question_type="simple",
+        )
+        assert outcome.grade == "missing"
+        assert outcome.score == 0
+        assert outcome.method == "refusal"
+
+    def test_empty_pred_maps_to_missing(self) -> None:
+        """An empty prediction is graded as missing."""
+        outcome = grade_deterministic(
+            pred="",
+            gold="Tim Cook",
+            question_type="simple",
+        )
+        assert outcome.grade == "missing"
+
+    def test_no_information_maps_to_missing(self) -> None:
+        """A "not enough information" reply is graded as missing."""
+        outcome = grade_deterministic(
+            pred="There is not enough information to answer.",
+            gold="42",
+            question_type="simple",
+        )
+        assert outcome.grade == "missing"
+
+
+class TestGradeDeterministicFalsePremise:
+    """Routing of ``false_premise`` questions through ``grade_deterministic``."""
+
+    def test_flagging_premise_is_correct(self) -> None:
+        """Calling out the false premise is graded correct."""
+        outcome = grade_deterministic(
+            pred="The question contains a false premise; that movie does not exist.",
+            gold="invalid question",
+            question_type="false_premise",
+        )
+        assert outcome.grade == "correct"
+        assert outcome.method == "false_premise_flagged"
+
+    def test_committing_to_false_answer_is_unclear(self) -> None:
+        """Answering as if the premise held is graded incorrect."""
+        outcome = grade_deterministic(
+            pred="The album was released in 2010.",
+            gold="invalid question",
+            question_type="false_premise",
+        )
+        # The ``false_premise_unclear`` method marks judge-fallback territory.
+        assert outcome.grade == "incorrect"
+        assert outcome.method == "false_premise_unclear"
+
+    def test_idk_on_false_premise_is_missing(self) -> None:
+        """A refusal is graded missing even on a false-premise question."""
+        # Refusal detection takes precedence over false-premise routing.
+        outcome = grade_deterministic(
+            pred="I don't know.",
+            gold="invalid question",
+            question_type="false_premise",
+        )
+        assert outcome.grade == "missing"
+
+
+class TestGradeDeterministicLexicalMiss:
+    """Predictions with no lexical overlap the deterministic grader can credit."""
+
+    def test_unknown_paraphrase_routes_to_judge(self) -> None:
+        """A paraphrase sharing no words with gold is a ``lexical_miss``."""
+        outcome = grade_deterministic(
+            pred="It is the technology giant in Cupertino.",
+            gold="Apple Inc",
+            question_type="simple",
+        )
+        # No judge is involved here, so lexical_miss resolves to incorrect.
+        assert outcome.grade == "incorrect"
+        assert outcome.method == "lexical_miss"
+
+    def test_short_pred_no_substring_credit(self) -> None:
+        """A two-character prediction earns no reverse-substring credit."""
+        # The reverse-substring path only credits predictions of length >= 3.
+        outcome = grade_deterministic(
+            pred="JK",
+            gold="JK Rowling",
+            question_type="simple",
+        )
+        assert outcome.grade == "incorrect"
+
+
+class TestGradeResultShape:
+    """Shape of ``CragGradeResult`` and the grade-to-score mapping."""
+
+    def test_to_dict_round_trip(self) -> None:
+        """``to_dict`` exposes grade, score and method under the same names."""
+        graded = CragGradeResult(
+            grade="correct",
+            score=1,
+            method="exact",
+            normalised_pred="x",
+            normalised_gold="x",
+        )
+        as_dict = graded.to_dict()
+        assert as_dict["grade"] == "correct"
+        assert as_dict["score"] == 1
+        assert as_dict["method"] == "exact"
+
+    def test_score_matches_grade(self) -> None:
+        """correct / missing / incorrect carry scores 1 / 0 / -1."""
+        score_for = {"correct": 1, "missing": 0, "incorrect": -1}
+        # Each case is (gold, prediction, expected grade); results come
+        # from the grader itself so the score field is filled in by it.
+        cases = (
+            ("hi", "hi", "correct"),
+            ("hi", "I don't know", "missing"),
+            ("hi", "bye", "incorrect"),
+        )
+        for gold, pred, want_grade in cases:
+            graded = grade_deterministic(pred=pred, gold=gold, question_type="simple")
+            assert graded.grade == want_grade
+            assert graded.score == score_for[want_grade]
--- a/surfsense_evals/tests/suites/test_crag_html_extract.py
+++ b/surfsense_evals/tests/suites/test_crag_html_extract.py
@ -0,0 +1,149 @@
+"""Tests for the CRAG HTML extractor.
+
+We don't network-fetch trafilatura; we just verify the wrapper:
+
+* Strips obvious boilerplate (nav/footer/scripts) out of the result.
+* Falls back to the stdlib stripper on degenerate input.
+* Caps output at the configured ceiling.
+* Always prepends a metadata header (``# title``) when content is
+  produced.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.suites.research.crag.html_extract import (
+    extract_main_content,
+)
+
+
+# Article-style page shared by the extraction tests below. The <article>
+# holds the text the tests look for; it is surrounded by <nav>, <header>,
+# <footer>, <script> and <style> markup, of which the tests assert that
+# the header, footer and script text is absent from the extracted output.
+_RICH_HTML = """\
+<!DOCTYPE html>
+<html>
+<head><title>Apple Q3 Earnings</title>
+<script>const a=1;</script>
+<style>body{font-family:sans;}</style>
+</head>
+<body>
+<nav><a href="/home">Home</a><a href="/about">About</a></nav>
+<header><h1>Tech News Site</h1><p>Subscribe to our newsletter</p></header>
+<main>
+<article>
+  <h1>Apple posts $90B revenue in Q3 2024</h1>
+  <p>Apple Inc. announced its Q3 2024 financial results today, reporting
+  $90 billion in revenue, beating analyst expectations of $87 billion.</p>
+  <p>The company saw growth across iPhone, services, and wearables.
+  CEO Tim Cook attributed the performance to strong demand in emerging
+  markets, particularly India.</p>
+  <h2>Segment breakdown</h2>
+  <ul>
+    <li>iPhone: $45B</li>
+    <li>Services: $24B</li>
+    <li>Mac: $7B</li>
+  </ul>
+</article>
+</main>
+<footer><p>Copyright 2024 Tech News Site. All rights reserved.</p></footer>
+</body></html>
+"""
+
+
+class TestExtractMainContent:
+    """``extract_main_content`` on a realistic page and on degenerate input."""
+
+    @staticmethod
+    def _extract_rich(**extra):
+        """Run the extractor on ``_RICH_HTML`` with the shared url and page name."""
+        return extract_main_content(
+            _RICH_HTML,
+            url="https://example.com/apple",
+            page_name="Apple Q3 Earnings",
+            **extra,
+        )
+
+    def test_extracts_main_article(self) -> None:
+        """Article text is kept and a title/source header is prepended."""
+        extracted = self._extract_rich()
+        assert extracted.ok
+        assert "Apple" in extracted.text
+        assert "Q3 2024" in extracted.text
+        # The metadata header comes first.
+        assert extracted.text.startswith("# Apple Q3 Earnings")
+        assert "Source: https://example.com/apple" in extracted.text
+
+    def test_strips_boilerplate(self) -> None:
+        """Header, footer and script text are absent from the output."""
+        extracted = self._extract_rich()
+        assert extracted.ok
+        # None of the boilerplate strings may leak into the result.
+        assert "Subscribe to our newsletter" not in extracted.text
+        assert "Copyright 2024 Tech News Site" not in extracted.text
+        assert "const a=1" not in extracted.text  # script content
+
+    def test_includes_last_modified_when_provided(self) -> None:
+        """A supplied ``last_modified`` shows up in the output text."""
+        extracted = self._extract_rich(last_modified="2024-08-01")
+        assert "Last modified: 2024-08-01" in extracted.text
+
+    def test_empty_html_returns_empty_result(self) -> None:
+        """Empty input gives a not-ok result with method ``empty`` and no chars."""
+        extracted = extract_main_content("", url="https://x.test/")
+        assert not extracted.ok
+        assert extracted.method == "empty"
+        assert extracted.n_chars == 0
+
+    def test_whitespace_only_html_is_empty(self) -> None:
+        """Whitespace-only input is not ok either."""
+        extracted = extract_main_content("   \n   ", url="https://x.test/")
+        assert not extracted.ok
+
+    def test_garbage_html_falls_back(self) -> None:
+        """Malformed markup, if extracted at all, still yields the visible words."""
+        # Trafilatura is expected to reject this; the fallback stripper
+        # should still be able to produce text.
+        extracted = extract_main_content(
+            "<<weird>>not a tag>>>The brown fox<<jumped<<",
+            url="https://x.test/garbage",
+            page_name="Garbage",
+        )
+        # NOTE(review): when the result is not ok nothing is asserted, so
+        # this test cannot fail on a total extraction miss.
+        if not extracted.ok:
+            return
+        assert any(
+            fragment in extracted.text for fragment in ("brown fox", "jumped")
+        )
+
+
+class TestFallbackStripper:
+    """Pages without a clear main region, and HTML entity handling."""
+
+    def test_extract_when_no_clear_main(self) -> None:
+        """Plain paragraphs directly under ``<body>`` are both extracted."""
+        markup = """
+        <html><body>
+        <p>This is content one.</p>
+        <p>This is content two.</p>
+        </body></html>
+        """
+        extracted = extract_main_content(
+            markup,
+            url="https://x.test/",
+            page_name="Title",
+        )
+        assert extracted.ok
+        for expected in ("content one", "content two"):
+            assert expected in extracted.text
+
+    def test_html_entities_decoded(self) -> None:
+        """``&amp;`` does not survive as a raw entity in the output."""
+        markup = """<html><body>
+        <article>
+        <p>Tom &amp; Jerry &mdash; classic cartoon &copy; 1940.</p>
+        <p>It's a story about a cat &lt;Tom&gt; and a mouse &lt;Jerry&gt;.</p>
+        </article>
+        </body></html>"""
+        extracted = extract_main_content(markup, url="https://x.test/")
+        assert extracted.ok
+        # The ampersand entity must have been decoded.
+        assert "&amp;" not in extracted.text
+        assert "Tom" in extracted.text
+        assert "Jerry" in extracted.text
+
+
+class TestOutputCapping:
+    """Length ceiling applied to very large extractions."""
+
+    def test_long_output_is_truncated(self) -> None:
+        """A ~600k-character article stays under 250k when the marker is present."""
+        # 50,000 repetitions of a 12-character phrase, well past the 200k cap.
+        filler = "hello world " * 50_000
+        html = f"<html><body><article><p>{filler}</p></article></body></html>"
+        extracted = extract_main_content(html, url="https://x.test/", page_name="long")
+        assert extracted.ok
+        # NOTE(review): the length bound is only checked when the
+        # truncation marker appears; without the marker this test
+        # asserts nothing about the cap.
+        truncated = "[...truncated...]" in extracted.text
+        if truncated:
+            # Budget: metadata header + 200k body cap + slack.
+            assert len(extracted.text) <= 250_000
--- a/surfsense_evals/tests/suites/test_frames_dataset.py
+++ b/surfsense_evals/tests/suites/test_frames_dataset.py
@ -0,0 +1,154 @@
+"""Tests for the FRAMES dataset parser.
+
+Network-free: we round-trip a tiny fixture TSV through pandas and
+``load_questions`` to confirm:
+
+* row indices become zero-padded ``Q###`` ids,
+* ``wiki_links`` (Python list literal) is materialised correctly,
+* ``reasoning_types`` is split on the pipe separator,
+* missing Prompt/Answer rows are dropped, and
+* the legacy ``wikipedia_link_*`` per-cell fallback works when
+  ``wiki_links`` is missing/empty.
+"""
+
+from __future__ import annotations
+
+import textwrap
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.suites.research.frames.dataset import (
+    FramesQuestion,
+    _parse_reasoning_types,
+    _parse_wiki_links,
+    load_questions,
+)
+
+
+# ---------------------------------------------------------------------------
+# Pure-function tests
+# ---------------------------------------------------------------------------
+
+
+class TestParseWikiLinks:
+    """``_parse_wiki_links`` on list literals, blanks, CSV text and real lists."""
+
+    def test_python_list_literal(self) -> None:
+        literal = "['https://en.wikipedia.org/wiki/A', 'https://en.wikipedia.org/wiki/B']"
+        expected = [
+            "https://en.wikipedia.org/wiki/A",
+            "https://en.wikipedia.org/wiki/B",
+        ]
+        assert _parse_wiki_links(literal) == expected
+
+    def test_none_or_empty(self) -> None:
+        # ``None``, the empty string and an empty list literal all mean "no links".
+        for blank in (None, "", "[]"):
+            assert _parse_wiki_links(blank) == []
+
+    def test_unquoted_csv_fallback(self) -> None:
+        # A plain comma-separated string (not a list literal) is still split.
+        parsed = _parse_wiki_links("https://a, https://b")
+        assert parsed == ["https://a", "https://b"]
+
+    def test_already_a_list(self) -> None:
+        given = ["x", "y"]
+        assert _parse_wiki_links(given) == ["x", "y"]
+
+
+class TestParseReasoningTypes:
+    """``_parse_reasoning_types`` on pipe-delimited tag strings."""
+
+    def test_pipe_separated(self) -> None:
+        tags = _parse_reasoning_types("Numerical reasoning | Multiple constraints")
+        assert tags == ["Numerical reasoning", "Multiple constraints"]
+
+    def test_single_tag(self) -> None:
+        tags = _parse_reasoning_types("Tabular reasoning")
+        assert tags == ["Tabular reasoning"]
+
+    def test_empty(self) -> None:
+        for blank in (None, ""):
+            assert _parse_reasoning_types(blank) == []
+
+
+# ---------------------------------------------------------------------------
+# Round-trip via pandas
+# ---------------------------------------------------------------------------
+
+
+def _write_tsv(path: Path, body: str) -> None:
+    """Write ``body`` to ``path`` as UTF-8 after removing its common leading indentation.
+
+    NOTE(review): no test in this module calls this helper — confirm
+    whether it is still needed.
+    """
+
+    dedented = textwrap.dedent(body)
+    path.write_text(dedented, encoding="utf-8")
+
+
+def test_load_questions_basic(tmp_path: Path) -> None:
+    """Two valid rows load with ids, links and tags; the prompt-less row is dropped."""
+
+    # The first header cell is blank: an unnamed leading column carries the row number.
+    header = "\tPrompt\tAnswer\twikipedia_link_1\twikipedia_link_2\treasoning_types\twiki_links"
+    row_president = (
+        "0\tWho was the 15th president?\tJames Buchanan\t"
+        "https://en.wikipedia.org/wiki/James_Buchanan\t\t"
+        "Multiple constraints\t"
+        "['https://en.wikipedia.org/wiki/James_Buchanan']"
+    )
+    row_years = (
+        "1\tHow many years between A and B?\t87\t"
+        "https://en.wikipedia.org/wiki/A\thttps://en.wikipedia.org/wiki/B\t"
+        "Numerical reasoning | Temporal reasoning\t"
+        "['https://en.wikipedia.org/wiki/A', 'https://en.wikipedia.org/wiki/B']"
+    )
+    # Deliberately has no Prompt, so the loader is expected to skip it.
+    row_without_prompt = "2\t\tunused\t\t\t\t"
+
+    tsv = tmp_path / "test.tsv"
+    lines = [header, row_president, row_years, row_without_prompt]
+    tsv.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+    questions = load_questions(tsv)
+    assert len(questions) == 2
+    first, second = questions
+
+    assert isinstance(first, FramesQuestion)
+    assert first.qid == "Q000"
+    assert first.raw_index == 0
+    assert first.gold_answer == "James Buchanan"
+    assert first.wiki_urls == ["https://en.wikipedia.org/wiki/James_Buchanan"]
+    assert first.reasoning_types == ["Multiple constraints"]
+
+    assert second.qid == "Q001"
+    assert second.gold_answer == "87"
+    expected_urls = [
+        "https://en.wikipedia.org/wiki/A",
+        "https://en.wikipedia.org/wiki/B",
+    ]
+    assert second.wiki_urls == expected_urls
+    assert second.reasoning_types == ["Numerical reasoning", "Temporal reasoning"]
+
+
+def test_load_questions_falls_back_to_per_cell_links(tmp_path: Path) -> None:
+    """With a blank ``wiki_links`` cell, URLs come from the
+    ``wikipedia_link_*`` columns instead."""
+
+    header = "\tPrompt\tAnswer\twikipedia_link_1\twikipedia_link_2\treasoning_types\twiki_links"
+    # The trailing tab leaves the final ``wiki_links`` cell empty.
+    data_row = (
+        "0\tQ?\tA\t"
+        "https://en.wikipedia.org/wiki/Cell1\thttps://en.wikipedia.org/wiki/Cell2\t"
+        "Numerical reasoning\t"
+    )
+    tsv = tmp_path / "test.tsv"
+    tsv.write_text(header + "\n" + data_row + "\n", encoding="utf-8")
+
+    loaded = load_questions(tsv)
+    assert len(loaded) == 1
+    expected_urls = [
+        "https://en.wikipedia.org/wiki/Cell1",
+        "https://en.wikipedia.org/wiki/Cell2",
+    ]
+    assert loaded[0].wiki_urls == expected_urls
+
+
+def test_load_questions_to_dict_round_trip(tmp_path: Path) -> None:
+    """``to_dict`` on a loaded question exposes its id, URLs and reasoning tags."""
+
+    header = "\tPrompt\tAnswer\treasoning_types\twiki_links"
+    data_row = "0\tQ?\tParis\tTemporal reasoning\t['https://en.wikipedia.org/wiki/Paris']"
+    tsv = tmp_path / "test.tsv"
+    tsv.write_text(header + "\n" + data_row + "\n", encoding="utf-8")
+
+    # Unpacking into a one-element tuple also checks that exactly one row loaded.
+    (question,) = load_questions(tsv)
+    as_dict = question.to_dict()
+    assert as_dict["qid"] == "Q000"
+    assert as_dict["wiki_urls"] == ["https://en.wikipedia.org/wiki/Paris"]
+    assert as_dict["reasoning_types"] == ["Temporal reasoning"]
--- a/surfsense_evals/tests/suites/test_frames_grader.py
+++ b/surfsense_evals/tests/suites/test_frames_grader.py
@ -0,0 +1,160 @@
+"""Tests for the FRAMES grader's deterministic shortcut.
+
+The LLM-judge fallback is excluded here (network call); we just
+confirm the rule-based path picks up obvious correct/incorrect
+cases and routes the ambiguous ones to ``lexical_miss`` so the
+runner knows to consult the judge.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.suites.research.frames.grader import (
+    GradeResult,
+    _maybe_number,
+    _normalise,
+    _whole_word_substring,
+    grade_deterministic,
+)
+
+
+class TestNormalisation:
+    """``_normalise`` canonicalises answer strings before comparison."""
+
+    def test_lowercase_and_punct_stripped(self) -> None:
+        normalised = _normalise("Jane Ballou.")
+        assert normalised == "jane ballou"
+
+    def test_articles_removed(self) -> None:
+        normalised = _normalise("The Eiffel Tower")
+        assert normalised == "eiffel tower"
+
+    def test_whitespace_squashed(self) -> None:
+        # Leading/trailing padding, repeated spaces and a tab all collapse.
+        normalised = _normalise("  multi   space\tinput  ")
+        assert normalised == "multi space input"
+
+    def test_empty_returns_empty(self) -> None:
+        assert _normalise("") == ""
+        # ``None`` is outside the annotated type but must still yield "".
+        from_none = _normalise(None)  # type: ignore[arg-type]
+        assert from_none == ""
+
+
+class TestNumericExtraction:
+    """``_maybe_number`` pulls a float out of text, or returns ``None``."""
+
+    def test_simple_int(self) -> None:
+        value = _maybe_number("42")
+        assert value == 42.0
+
+    def test_int_with_commas(self) -> None:
+        # Thousands separators are tolerated.
+        value = _maybe_number("1,234")
+        assert value == 1234.0
+
+    def test_year_in_sentence(self) -> None:
+        value = _maybe_number("It was published in 1847.")
+        assert value == 1847.0
+
+    def test_word_number(self) -> None:
+        # Spelled-out numbers are recognised regardless of capitalisation.
+        for word, expected in (("five", 5.0), ("Twenty", 20.0)):
+            assert _maybe_number(word) == expected
+
+    def test_no_number_returns_none(self) -> None:
+        for text in ("Jane Ballou", ""):
+            assert _maybe_number(text) is None
+
+
+class TestWholeWordSubstring:
+    """``_whole_word_substring`` matches needles only on word boundaries."""
+
+    def test_phrase_match(self) -> None:
+        haystack = "president of the united states"
+        assert _whole_word_substring(haystack, "united states")
+
+    def test_word_boundary_required(self) -> None:
+        # "states" is a prefix of "statesman" but not a whole word in it.
+        haystack = "the renowned statesman"
+        assert not _whole_word_substring(haystack, "states")
+
+    def test_empty_needle(self) -> None:
+        found = _whole_word_substring("anything", "")
+        assert not found
+
+
+class TestExactMatch:
+    """Predictions equal to gold after normalisation are graded correct."""
+
+    def test_identical(self) -> None:
+        outcome = grade_deterministic(pred="Jane Ballou", gold="Jane Ballou")
+        assert outcome.correct is True
+        assert outcome.method == "exact"
+
+    def test_case_insensitive(self) -> None:
+        outcome = grade_deterministic(pred="paris", gold="Paris")
+        assert outcome.correct is True
+        assert outcome.method == "exact"
+
+    def test_punctuation_ignored(self) -> None:
+        # Only correctness is pinned here; the method label is not checked.
+        outcome = grade_deterministic(pred="Jane Ballou.", gold="Jane Ballou")
+        assert outcome.correct is True
+
+
+class TestNumericPath:
+    """Numeric golds are compared by value rather than by string."""
+
+    # NOTE(review): the comments below assume the grader's tolerance is
+    # max(1% of gold, 0.5) — confirm against the grader implementation.
+
+    def test_int_match(self) -> None:
+        outcome = grade_deterministic(pred="The answer is 87", gold="87")
+        assert outcome.correct is True
+        assert outcome.method == "numeric"
+
+    def test_word_number_matches_digit(self) -> None:
+        outcome = grade_deterministic(pred="five", gold="5")
+        assert outcome.correct is True
+        assert outcome.method == "numeric"
+
+    def test_off_by_more_than_tolerance_fails(self) -> None:
+        # |86 - 87| = 1 exceeds the assumed tolerance max(0.87, 0.5) = 0.87.
+        outcome = grade_deterministic(pred="86", gold="87")
+        assert outcome.correct is False
+        assert outcome.method == "numeric_miss"
+
+    def test_within_one_percent_passes(self) -> None:
+        # |100 - 101| = 1 is inside the assumed tolerance max(1.01, 0.5) = 1.01.
+        outcome = grade_deterministic(pred="100", gold="101")
+        assert outcome.correct is True
+
+
+class TestSubstringPath:
+    """Containment between prediction and gold, in either direction."""
+
+    def test_pred_contains_gold(self) -> None:
+        verbose_pred = "The answer is Jane Ballou according to records"
+        outcome = grade_deterministic(pred=verbose_pred, gold="Jane Ballou")
+        assert outcome.correct is True
+        assert outcome.method == "substring"
+
+    def test_gold_contains_pred_with_minimum_length(self) -> None:
+        # Reverse direction: the prediction "Kennedy" is a fragment of the
+        # longer gold "John F. Kennedy" and is still credited.
+        outcome = grade_deterministic(pred="Kennedy", gold="John F. Kennedy")
+        assert outcome.correct is True
+        assert outcome.method == "substring_reverse"
+
+    def test_too_short_pred_no_reverse_credit(self) -> None:
+        # A trivial fragment like "of" must not earn reverse-substring credit.
+        # NOTE(review): whether the rejection comes from a length floor or
+        # from normalisation is not visible here; only the outcome is pinned.
+        outcome = grade_deterministic(pred="of", gold="World of Warcraft")
+        assert outcome.correct is False
+
+
+class TestLexicalMiss:
+    """Non-matching and degenerate inputs get distinct ``method`` labels."""
+
+    def test_completely_different_pred_falls_through(self) -> None:
+        outcome = grade_deterministic(pred="London", gold="Paris")
+        assert outcome.correct is False
+        assert outcome.method == "lexical_miss"
+
+    def test_empty_pred(self) -> None:
+        outcome = grade_deterministic(pred="", gold="Paris")
+        assert outcome.correct is False
+        assert outcome.method == "empty_pred"
+
+    def test_empty_gold_defensive(self) -> None:
+        # Gold should never be empty in practice; this pins the guard anyway.
+        outcome = grade_deterministic(pred="something", gold="")
+        assert outcome.correct is False
+        assert outcome.method == "empty_gold"
+
+
+class TestGradeResultShape:
+    """The serialised grade result carries at least the documented keys."""
+
+    def test_dict_has_all_expected_keys(self) -> None:
+        expected_keys = {
+            "correct",
+            "f1",
+            "method",
+            "normalised_pred",
+            "normalised_gold",
+            "judge_rationale",
+        }
+        serialised = grade_deterministic(pred="Paris", gold="Paris").to_dict()
+        # Superset check: extra keys are allowed, missing ones are not.
+        assert expected_keys <= set(serialised)
--- a/surfsense_evals/tests/suites/test_frames_wiki_fetch.py
+++ b/surfsense_evals/tests/suites/test_frames_wiki_fetch.py
@ -0,0 +1,112 @@
+"""Tests for the FRAMES Wikipedia fetcher.
+
+We mock the MW API with respx so tests are network-free. Coverage:
+
+* URL → title parsing (percent-encoded, underscores, query strings, non-Wikipedia hosts)
+* Filename safety (slashes, special chars)
+* Cache hit short-circuits the API call
+* Missing pages return ``None`` (not an exception)
+* Successful fetches write ``# Title`` markdown to disk
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import httpx
+import pytest
+import respx
+
+from surfsense_evals.suites.research.frames.wiki_fetch import (
+    WIKI_API,
+    WikiFetcher,
+    cache_filename_for_title,
+    title_from_url,
+)
+
+
+class TestTitleFromUrl:
+    """``title_from_url`` turns article URLs into readable titles."""
+
+    def test_basic(self) -> None:
+        title = title_from_url("https://en.wikipedia.org/wiki/James_Buchanan")
+        assert title == "James Buchanan"
+
+    def test_percent_encoded(self) -> None:
+        # %C3%AB is the UTF-8 percent-encoding of "ë".
+        title = title_from_url("https://en.wikipedia.org/wiki/Charlotte_Bront%C3%AB")
+        assert title == "Charlotte Brontë"
+
+    def test_query_string_dropped(self) -> None:
+        title = title_from_url("https://en.wikipedia.org/wiki/Foo?action=edit")
+        assert title == "Foo"
+
+    def test_non_wiki_raises(self) -> None:
+        non_wiki_url = "https://example.com/wiki/Foo"
+        with pytest.raises(ValueError):
+            title_from_url(non_wiki_url)
+
+
+class TestCacheFilename:
+    """``cache_filename_for_title`` yields filesystem-safe ``.md`` names."""
+
+    def test_simple(self) -> None:
+        filename = cache_filename_for_title("James Buchanan")
+        assert filename == "James_Buchanan.md"
+
+    def test_unicode_replaced_with_underscore(self) -> None:
+        # The non-ASCII "ë" becomes a single "_" and the space becomes "_"
+        # as well. The mapping is lossy but deterministic, which is all the
+        # cache needs.
+        filename = cache_filename_for_title("Charlotte Brontë")
+        assert filename == "Charlotte_Bront_.md"
+
+    def test_slashes_replaced(self) -> None:
+        # Titles such as "I/O" contain ``/``, which would otherwise be
+        # read as a directory separator.
+        filename = cache_filename_for_title("I/O")
+        assert filename == "I_O.md"
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_fetch_success_writes_markdown(tmp_path: Path) -> None:
+    """A successful API response is written to disk as ``# Title`` markdown."""
+    page = {
+        "pageid": 1,
+        "title": "James Buchanan",
+        "extract": "James Buchanan was the 15th president of the United States.",
+    }
+    api_response = httpx.Response(200, json={"query": {"pages": [page]}})
+    respx.get(WIKI_API).mock(return_value=api_response)
+
+    # High rate limit so the throttle does not slow the test down.
+    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)
+    article = await fetcher.fetch("https://en.wikipedia.org/wiki/James_Buchanan")
+
+    assert article is not None
+    assert article.title == "James Buchanan"
+    markdown = article.markdown_path.read_text(encoding="utf-8")
+    assert markdown.startswith("# James Buchanan")
+    assert "15th president" in markdown
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_fetch_missing_page_returns_none(tmp_path: Path) -> None:
+    """A page flagged ``missing`` yields ``None`` and leaves no cache file."""
+    missing_page = {"title": "DoesNotExist", "missing": True}
+    api_response = httpx.Response(200, json={"query": {"pages": [missing_page]}})
+    respx.get(WIKI_API).mock(return_value=api_response)
+
+    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)
+    article = await fetcher.fetch("https://en.wikipedia.org/wiki/DoesNotExist")
+
+    assert article is None
+    would_be_cache_file = tmp_path / "DoesNotExist.md"
+    assert not would_be_cache_file.exists()
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_fetch_cache_hit_skips_api(tmp_path: Path) -> None:
+    """A pre-existing cache file is returned without any HTTP request."""
+    # Seed the cache before the fetcher is constructed.
+    cache_file = tmp_path / cache_filename_for_title("Cached Page")
+    cache_file.write_text("# Cached Page\n\nfrom disk\n", encoding="utf-8")
+    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)
+
+    # No route is registered on purpose: inside ``respx.mock`` an
+    # unmatched request errors out, so any network access fails the test.
+    article = await fetcher.fetch("https://en.wikipedia.org/wiki/Cached_Page")
+
+    assert article is not None
+    assert article.markdown_path == cache_file
+    on_disk = article.markdown_path.read_text(encoding="utf-8")
+    assert on_disk.endswith("from disk\n")
--- a/surfsense_evals/tests/suites/test_mmlongbench_grader.py
+++ b/surfsense_evals/tests/suites/test_mmlongbench_grader.py
@ -0,0 +1,129 @@
+"""Tests for the MMLongBench-Doc format-aware grader.
+
+The grader is the critical correctness piece for the open-ended
+benchmark (no MCQ shortcut), so we cover all five formats with
+representative happy-path + edge-case rows.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.suites.multimodal_doc.mmlongbench.grader import grade
+
+
+class TestStrFormat:
+    """Grading of free-text (``Str``) answers."""
+
+    def test_exact_match(self) -> None:
+        outcome = grade(pred="Apollo 11", gold="Apollo 11", answer_format="Str")
+        assert outcome.correct is True
+        assert outcome.f1 == 1.0
+        assert outcome.method == "str_norm"
+
+    def test_lowercase_normalised(self) -> None:
+        outcome = grade(pred="paris", gold="Paris", answer_format="Str")
+        assert outcome.correct is True
+
+    def test_punctuation_difference_drops_to_substring(self) -> None:
+        # A dotted acronym is not credited against its undotted form.
+        # This pins an accepted failure mode: the grader fails closed
+        # rather than over-claiming a match.
+        # NOTE(review): the original note attributes this to "N.A.S.A."
+        # normalising to separate single-letter tokens — confirm in the grader.
+        outcome = grade(pred="N.A.S.A.", gold="NASA", answer_format="Str")
+        assert outcome.correct is False
+
+    def test_substring_credit(self) -> None:
+        outcome = grade(pred="The answer is Paris.", gold="Paris", answer_format="Str")
+        assert outcome.correct is True
+
+    def test_completely_wrong(self) -> None:
+        outcome = grade(pred="London", gold="Paris", answer_format="Str")
+        assert outcome.correct is False
+        assert outcome.f1 < 0.5
+
+    def test_empty_pred(self) -> None:
+        outcome = grade(pred="", gold="Paris", answer_format="Str")
+        assert outcome.correct is False
+        assert outcome.f1 == 0.0
+
+
+class TestIntFormat:
+    """Grading of integer (``Int``) answers."""
+
+    def test_exact_int(self) -> None:
+        outcome = grade(pred="42", gold="42", answer_format="Int")
+        assert outcome.correct is True
+
+    def test_int_in_sentence(self) -> None:
+        # The integer is found inside surrounding prose.
+        outcome = grade(pred="The answer is 42 years.", gold="42", answer_format="Int")
+        assert outcome.correct is True
+
+    def test_int_with_commas(self) -> None:
+        outcome = grade(pred="1,500", gold="1500", answer_format="Int")
+        assert outcome.correct is True
+
+    def test_wrong_int(self) -> None:
+        outcome = grade(pred="41", gold="42", answer_format="Int")
+        assert outcome.correct is False
+
+    def test_no_int_in_pred(self) -> None:
+        outcome = grade(pred="not answerable", gold="42", answer_format="Int")
+        assert outcome.correct is False
+
+
+class TestFloatFormat:
+    """Grading of floating-point (``Float``) answers."""
+
+    # NOTE(review): the comments below assume a 1% relative tolerance
+    # with a 0.01 absolute floor — confirm against the grader.
+
+    def test_exact_float(self) -> None:
+        outcome = grade(pred="3.14", gold="3.14", answer_format="Float")
+        assert outcome.correct is True
+
+    def test_within_tolerance(self) -> None:
+        # 3.13 differs from 3.14 by about 0.3%, inside the assumed 1%.
+        outcome = grade(pred="3.13", gold="3.14", answer_format="Float")
+        assert outcome.correct is True
+
+    def test_outside_tolerance(self) -> None:
+        outcome = grade(pred="3.5", gold="3.14", answer_format="Float")
+        assert outcome.correct is False
+
+    def test_european_decimal_comma(self) -> None:
+        # ``3,14`` is read as 3.14, not as the two integers 3 and 14.
+        outcome = grade(pred="3,14", gold="3.14", answer_format="Float")
+        assert outcome.correct is True
+
+    def test_zero_gold_with_small_abs_diff(self) -> None:
+        # A relative tolerance is meaningless at gold == 0, so the
+        # assumed absolute floor has to apply.
+        outcome = grade(pred="0.005", gold="0", answer_format="Float")
+        assert outcome.correct is True
+
+
+class TestListFormat:
+    """Grading of comma-separated (``List``) answers."""
+
+    def test_exact_set_match(self) -> None:
+        items = "apple, banana, cherry"
+        outcome = grade(pred=items, gold=items, answer_format="List")
+        assert outcome.correct is True
+        assert outcome.f1 == pytest.approx(1.0)
+
+    def test_set_match_different_order(self) -> None:
+        # Item order does not matter.
+        outcome = grade(
+            pred="cherry, apple, banana",
+            gold="apple, banana, cherry",
+            answer_format="List",
+        )
+        assert outcome.correct is True
+
+    def test_partial_overlap_gives_f1(self) -> None:
+        outcome = grade(
+            pred="apple, banana",
+            gold="apple, banana, cherry",
+            answer_format="List",
+        )
+        assert outcome.correct is False
+        assert 0.0 < outcome.f1 < 1.0
+
+    def test_extra_items_lower_precision(self) -> None:
+        outcome = grade(
+            pred="apple, banana, cherry, date",
+            gold="apple, banana, cherry",
+            answer_format="List",
+        )
+        assert 0.0 < outcome.f1 < 1.0
+        # All three gold items are found (recall 1) among four predictions
+        # (precision 3/4), so F1 is about 0.857.
+        precision = 3 / 4
+        recall = 1
+        expected_f1 = 2 * precision * recall / (precision + recall)
+        assert outcome.f1 == pytest.approx(expected_f1, rel=1e-3)
+
+
+class TestNoneFormat:
+    """Grading of unanswerable (``None``-format) questions."""
+
+    def test_unknown_phrase_credited(self) -> None:
+        refusals = ("Not answerable", "I cannot answer this.", "No answer", "N/A")
+        for refusal in refusals:
+            outcome = grade(pred=refusal, gold="Not answerable", answer_format="None")
+            # The phrase is the assertion message, so a failure names the culprit.
+            assert outcome.correct is True, refusal
+
+    def test_actual_answer_marked_wrong(self) -> None:
+        # A concrete answer to an unanswerable question counts as wrong.
+        outcome = grade(pred="The answer is 42.", gold="Not answerable", answer_format="None")
+        assert outcome.correct is False
+
+
+class TestUnknownFormatFallsBackToStr:
+    """Blank or unrecognised ``answer_format`` values use the ``Str`` grader."""
+
+    def test_blank_format_uses_str_grader(self) -> None:
+        outcome = grade(pred="Paris", gold="Paris", answer_format="")
+        assert outcome.correct is True
+        assert outcome.method == "str_norm"
+
+    def test_garbage_format_uses_str_grader(self) -> None:
+        outcome = grade(pred="Paris", gold="Paris", answer_format="quux")
+        assert outcome.correct is True
+        assert outcome.method == "str_norm"
--- a/surfsense_evals/tests/test_integration_smoke.py
+++ b/surfsense_evals/tests/test_integration_smoke.py
@ -0,0 +1,35 @@
+"""Opt-in integration smoke test against a live backend (e.g. ``http://localhost:8000``).
+
+Run with ``pytest -m integration``. Skipped by default. Touches the
+real backend at ``SURFSENSE_API_BASE`` — it must be reachable and one
+credential mode must be configured. ``OPENROUTER_API_KEY`` is
+irrelevant to this test.
+"""
+
+from __future__ import annotations
+
+import os
+
+import httpx
+import pytest
+
+from surfsense_evals.core.auth import acquire_token, client_with_auth
+from surfsense_evals.core.config import load_config
+
+pytestmark = pytest.mark.integration
+
+
+@pytest.mark.asyncio
+async def test_smoke_against_localhost():
+    """Authenticate against the configured backend and list the global LLM configs."""
+    # Both preconditions are opt-in: when unmet, the test skips rather than fails.
+    if os.environ.get("SURFSENSE_API_BASE") is None:
+        pytest.skip("SURFSENSE_API_BASE not set; skipping integration smoke")
+    config = load_config()
+    if config.credential_mode() == "none":
+        pytest.skip("No credentials in environment; skipping integration smoke")
+
+    token_bundle = await acquire_token(config)
+    endpoint = f"{config.surfsense_api_base}/api/v1/global-new-llm-configs"
+    async with client_with_auth(config, token_bundle) as client:
+        response = await client.get(endpoint)
+        # Turn an HTTP error status into a readable test failure.
+        try:
+            response.raise_for_status()
+        except httpx.HTTPStatusError as exc:
+            pytest.fail(f"Backend rejected smoke call: {exc!s}")
+        payload = response.json()
+        assert isinstance(payload, list)