mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-19 18:45:15 +02:00
chore: evals
This commit is contained in:
parent
2402b730fa
commit
3737118050
122 changed files with 22598 additions and 13 deletions
1
surfsense_evals/tests/core/__init__.py
Normal file
1
surfsense_evals/tests/core/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
|
||||
95
surfsense_evals/tests/core/test_auth.py
Normal file
95
surfsense_evals/tests/core/test_auth.py
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
"""Auth credential resolution + 401 refresh hook."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import respx
|
||||
|
||||
from surfsense_evals.core.auth import (
|
||||
CredentialError,
|
||||
acquire_token,
|
||||
client_with_auth,
|
||||
)
|
||||
from surfsense_evals.core.config import Config
|
||||
|
||||
|
||||
def _make_config(**overrides) -> Config:
    """Build a ``Config`` for tests from blank defaults plus ``overrides``.

    Every credential field starts as ``None`` and the API base is the fake
    host ``http://test``. Keyword arguments replace the matching defaults.
    ``data_dir`` / ``reports_dir`` fall back to fixed ``/tmp`` paths whenever
    the (possibly overridden) value is falsy.
    """
    from pathlib import Path

    fields = {
        "surfsense_api_base": "http://test",
        "openrouter_api_key": None,
        "openrouter_base_url": "https://openrouter.ai/api/v1",
        "surfsense_jwt": None,
        "surfsense_refresh_token": None,
        "surfsense_user_email": None,
        "surfsense_user_password": None,
        "data_dir": None,
        "reports_dir": None,
        **overrides,
    }
    # Config needs Path objects here; the tests never touch the filesystem.
    fallbacks = (
        ("data_dir", "/tmp/eval_test_data"),
        ("reports_dir", "/tmp/eval_test_reports"),
    )
    for key, fallback in fallbacks:
        if not fields[key]:
            fields[key] = Path(fallback)
    return Config(**fields)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_acquire_token_jwt_mode_short_circuits():
    """A configured JWT + refresh token come back unchanged with mode ``jwt``."""
    cfg = _make_config(surfsense_jwt="abc", surfsense_refresh_token="ref")

    tokens = await acquire_token(cfg)

    assert tokens.access_token == "abc"
    assert tokens.refresh_token == "ref"
    assert tokens.mode == "jwt"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock
async def test_acquire_token_local_mode_posts_form():
    """Email/password credentials yield the tokens served by the login route."""
    login_reply = httpx.Response(
        200,
        json={"access_token": "T", "refresh_token": "R", "token_type": "bearer"},
    )
    respx.post("http://test/auth/jwt/login").mock(return_value=login_reply)
    cfg = _make_config(
        surfsense_user_email="u@example.com",
        surfsense_user_password="pw",
    )

    tokens = await acquire_token(cfg)

    assert tokens.access_token == "T"
    assert tokens.refresh_token == "R"
    assert tokens.mode == "local"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_acquire_token_no_credentials():
    """With no credentials set, ``CredentialError`` names both env options."""
    cfg = _make_config()

    with pytest.raises(CredentialError) as excinfo:
        await acquire_token(cfg)

    # Checked after the ``raises`` block exits so the assertions really run.
    message = str(excinfo.value)
    assert "SURFSENSE_USER_EMAIL" in message
    assert "SURFSENSE_JWT" in message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock
async def test_client_with_auth_refreshes_on_401():
    """A 401 followed by a 200 ends in success and the bundle holds new tokens."""
    cfg = _make_config(surfsense_jwt="old", surfsense_refresh_token="ref")
    tokens = await acquire_token(cfg)

    refresh_reply = httpx.Response(
        200, json={"access_token": "new", "refresh_token": "ref2"}
    )
    respx.post("http://test/auth/jwt/refresh").mock(return_value=refresh_reply)

    # Served in order: the first request gets 401, the post-refresh retry 200.
    queued_replies = [
        httpx.Response(401, json={"detail": "expired"}),
        httpx.Response(200, json=[]),
    ]
    respx.get("http://test/api/v1/searchspaces").mock(side_effect=queued_replies)

    async with client_with_auth(cfg, tokens) as authed:
        reply = await authed.get("http://test/api/v1/searchspaces")

    assert reply.status_code == 200
    assert tokens.access_token == "new"
    assert tokens.refresh_token == "ref2"
|
||||
262
surfsense_evals/tests/core/test_clients.py
Normal file
262
surfsense_evals/tests/core/test_clients.py
Normal file
|
|
@ -0,0 +1,262 @@
|
|||
"""respx-mocked tests for the SurfSense HTTP clients."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import respx
|
||||
|
||||
from surfsense_evals.core.clients import (
|
||||
DocumentsClient,
|
||||
NewChatClient,
|
||||
SearchSpaceClient,
|
||||
)
|
||||
from surfsense_evals.core.clients.new_chat import ThreadBusyError
|
||||
|
||||
_BASE = "http://test"
|
||||
|
||||
|
||||
@pytest.fixture
def http() -> httpx.AsyncClient:
    """Provide an ``httpx.AsyncClient`` whose base URL is the fake ``_BASE`` host."""
    # NOTE(review): the client is returned without ever being closed here.
    async_client = httpx.AsyncClient(base_url=_BASE)
    return async_client
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SearchSpaceClient
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_create_search_space_returns_row(respx_mock, http):
    """``create`` exposes the ``id`` and ``name`` from the mocked POST response."""
    created_row = {
        "id": 99,
        "name": "eval-medical-2026",
        "description": None,
        "user_id": "user-x",
        "citations_enabled": True,
        "qna_custom_instructions": None,
    }
    respx_mock.post("/api/v1/searchspaces").mock(
        return_value=httpx.Response(200, json=created_row)
    )

    spaces = SearchSpaceClient(http, _BASE)
    space = await spaces.create("eval-medical-2026")

    assert space.id == 99
    assert space.name == "eval-medical-2026"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_delete_search_space_idempotent_on_404(respx_mock, http):
    """Deleting a search space the server reports as 404 must not raise."""
    not_found = httpx.Response(404, json={"detail": "gone"})
    respx_mock.delete("/api/v1/searchspaces/42").mock(return_value=not_found)

    spaces = SearchSpaceClient(http, _BASE)
    # Passing means simply returning; any exception fails the test.
    await spaces.delete(42)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_set_llm_preferences_partial_update(respx_mock, http):
    """Only the field passed to ``set_llm_preferences`` is sent in the PUT body."""
    server_prefs = {
        "agent_llm_id": -10042,
        "document_summary_llm_id": None,
        "image_generation_config_id": None,
        "vision_llm_config_id": None,
        "agent_llm": {
            "id": -10042,
            "provider": "OPENROUTER",
            "model_name": "anthropic/claude-sonnet-4.5",
        },
    }
    put_route = respx_mock.put("/api/v1/search-spaces/42/llm-preferences").mock(
        return_value=httpx.Response(200, json=server_prefs)
    )

    spaces = SearchSpaceClient(http, _BASE)
    prefs = await spaces.set_llm_preferences(42, agent_llm_id=-10042)

    assert prefs.agent_llm_id == -10042
    assert prefs.agent_llm["provider"] == "OPENROUTER"

    # Inspect the most recent captured request to see what went on the wire.
    last_request = put_route.calls[-1].request
    assert json.loads(last_request.content) == {"agent_llm_id": -10042}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DocumentsClient
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_documents_status_parses_state(respx_mock, http):
    """``get_status`` maps each document's ``state`` onto ``is_ready``.

    The mocked endpoint reports document 1 as ``ready`` and document 2 as
    ``failed``; the parsed statuses must carry that pairing.
    """
    respx_mock.get("/api/v1/documents/status").mock(
        return_value=httpx.Response(
            200,
            json={
                "items": [
                    {"id": 1, "title": "a.pdf", "document_type": "FILE",
                     "status": {"state": "ready", "reason": None}},
                    {"id": 2, "title": "b.pdf", "document_type": "FILE",
                     "status": {"state": "failed", "reason": "ETL boom"}},
                ]
            },
        )
    )
    client = DocumentsClient(http, _BASE)
    statuses = await client.get_status(search_space_id=1, document_ids=[1, 2])

    # Key readiness by document id. Comparing a set of ids and a set of flags
    # separately would still pass if the flags landed on the wrong documents.
    readiness = {s.document_id: s.is_ready for s in statuses}
    assert readiness == {1: True, 2: False}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_documents_upload_returns_payload(respx_mock, http, tmp_path: Path):
    """``upload`` surfaces ``document_ids`` and ``pending_files`` from the reply."""
    pdf_path = tmp_path / "a.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 small")

    upload_reply = {
        "message": "Files uploaded",
        "document_ids": [101],
        "duplicate_document_ids": [],
        "total_files": 1,
        "pending_files": 1,
        "skipped_duplicates": 0,
    }
    respx_mock.post("/api/v1/documents/fileupload").mock(
        return_value=httpx.Response(200, json=upload_reply)
    )

    documents = DocumentsClient(http, _BASE)
    outcome = await documents.upload(files=[pdf_path], search_space_id=7)

    assert outcome.document_ids == [101]
    assert outcome.pending_files == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_documents_list_chunks_paginated(respx_mock, http):
    """``list_chunks`` concatenates the chunks of both mocked pages, in order."""
    first_page = httpx.Response(200, json={
        "items": [{"id": 1, "content": "a"}, {"id": 2, "content": "b"}],
        "total": 3, "page": 0, "page_size": 2, "has_more": True,
    })
    second_page = httpx.Response(200, json={
        "items": [{"id": 3, "content": "c"}],
        "total": 3, "page": 1, "page_size": 2, "has_more": False,
    })
    # Successive GETs to the same route receive the pages in this order.
    respx_mock.get("/api/v1/documents/5/chunks").mock(
        side_effect=[first_page, second_page]
    )

    documents = DocumentsClient(http, _BASE)
    chunks = await documents.list_chunks(5, page_size=2)

    chunk_ids = [chunk.id for chunk in chunks]
    assert chunk_ids == [1, 2, 3]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# NewChatClient
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_create_thread_returns_id(respx_mock, http):
    """``create_thread`` returns the ``id`` field of the created thread row."""
    thread_row = {
        "id": 555,
        "title": "eval",
        "archived": False,
        "visibility": "PRIVATE",
        "search_space_id": 1,
        "messages": [],
        "created_at": "2026-05-11T00:00:00Z",
        "updated_at": "2026-05-11T00:00:00Z",
    }
    respx_mock.post("/api/v1/threads").mock(
        return_value=httpx.Response(200, json=thread_row)
    )

    chat = NewChatClient(http, _BASE)
    thread_id = await chat.create_thread(search_space_id=1)

    assert thread_id == 555
|
||||
|
||||
|
||||
def _sse_body(events: list[dict]) -> bytes:
    """Encode ``events`` as a UTF-8 SSE payload ending with a ``[DONE]`` frame.

    Each event becomes one ``data: <json>`` frame followed by a blank line.
    """
    frames = [f"data: {json.dumps(event)}\n\n" for event in events]
    frames.append("data: [DONE]\n\n")
    return "".join(frames).encode("utf-8")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_ask_accumulates_text_deltas(respx_mock, http):
    """``ask`` joins the streamed text deltas and reports the cited chunk id."""
    stream = _sse_body([
        {"type": "start", "messageId": "m1"},
        {"type": "text-start", "id": "t1"},
        {"type": "text-delta", "id": "t1", "delta": "Answer "},
        {"type": "text-delta", "id": "t1", "delta": "is "},
        {"type": "text-delta", "id": "t1", "delta": "B [citation:42]."},
        {"type": "text-end", "id": "t1"},
        {"type": "finish"},
    ])
    sse_reply = httpx.Response(
        200,
        content=stream,
        headers={"Content-Type": "text/event-stream"},
    )
    respx_mock.post("/api/v1/new_chat").mock(return_value=sse_reply)

    chat = NewChatClient(http, _BASE)
    result = await chat.ask(
        thread_id=1, search_space_id=2, user_query="What is the answer?"
    )

    assert result.text == "Answer is B [citation:42]."
    assert result.finished_normally is True
    assert any(citation["chunk_id"] == 42 for citation in result.citations)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_ask_409_thread_busy_retries(respx_mock, http):
    """A 409 ``THREAD_BUSY`` followed by a good stream still yields the answer."""
    busy_reply = httpx.Response(
        409,
        json={"detail": {"errorCode": "THREAD_BUSY", "message": "busy"}},
        headers={"Retry-After": "1"},
    )
    stream = _sse_body([
        {"type": "text-delta", "id": "t1", "delta": "ok"},
        {"type": "finish"},
    ])
    ok_reply = httpx.Response(
        200, content=stream, headers={"Content-Type": "text/event-stream"}
    )
    # First POST is rejected as busy; the second one streams the answer.
    respx_mock.post("/api/v1/new_chat").mock(side_effect=[busy_reply, ok_reply])

    chat = NewChatClient(http, _BASE)
    result = await chat.ask(
        thread_id=1, search_space_id=2, user_query="hi", max_busy_retries=2
    )

    assert result.text == "ok"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_ask_409_exhausts_retries(respx_mock, http):
    """When every POST answers 409, ``ask`` raises ``ThreadBusyError``."""
    always_busy = httpx.Response(
        409,
        json={"detail": {"errorCode": "TURN_CANCELLING", "message": "wait"}},
        headers={"Retry-After": "1"},
    )
    respx_mock.post("/api/v1/new_chat").mock(return_value=always_busy)

    chat = NewChatClient(http, _BASE)
    with pytest.raises(ThreadBusyError):
        await chat.ask(
            thread_id=1, search_space_id=2, user_query="hi", max_busy_retries=1
        )
|
||||
160
surfsense_evals/tests/core/test_config.py
Normal file
160
surfsense_evals/tests/core/test_config.py
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
"""Tests for env loading + state.json read/write."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
from surfsense_evals.core.config import (
|
||||
DEFAULT_SCENARIO,
|
||||
SCENARIOS,
|
||||
SuiteState,
|
||||
clear_suite_state,
|
||||
get_suite_state,
|
||||
load_config,
|
||||
set_suite_state,
|
||||
)
|
||||
|
||||
|
||||
def test_load_config_defaults_to_localhost(tmp_env):  # noqa: ARG001
    """Without credential env vars the config targets localhost, mode ``none``."""
    # NOTE(review): ``tmp_env`` is defined outside this file; presumably it
    # gives each test a clean environment - confirm in conftest.
    cfg = load_config()

    assert cfg.surfsense_api_base == "http://localhost:8000"
    assert cfg.has_jwt_mode() is False
    assert cfg.has_local_mode() is False
    assert cfg.credential_mode() == "none"
|
||||
|
||||
|
||||
def test_load_config_picks_up_jwt_env(tmp_env, monkeypatch):  # noqa: ARG001
    """Setting ``SURFSENSE_JWT`` switches the credential mode to ``jwt``."""
    monkeypatch.setenv("SURFSENSE_JWT", "tok")

    mode = load_config().credential_mode()

    assert mode == "jwt"
|
||||
|
||||
|
||||
def test_load_config_picks_up_local_env(tmp_env, monkeypatch):  # noqa: ARG001
    """Email + password env vars switch the credential mode to ``local``."""
    credentials = (
        ("SURFSENSE_USER_EMAIL", "u@x.com"),
        ("SURFSENSE_USER_PASSWORD", "pw"),
    )
    for env_name, env_value in credentials:
        monkeypatch.setenv(env_name, env_value)

    mode = load_config().credential_mode()

    assert mode == "local"
|
||||
|
||||
|
||||
def test_state_roundtrip_per_suite(tmp_env):  # noqa: ARG001
    """Suite state is stored per suite; clearing one leaves the other intact."""
    cfg = load_config()
    assert get_suite_state(cfg, "medical") is None

    suite_states = {
        "medical": SuiteState(
            search_space_id=1,
            agent_llm_id=-10042,
            provider_model="anthropic/claude-sonnet-4.5",
            created_at="2026-05-11T20-30-00Z",
        ),
        "legal": SuiteState(
            search_space_id=2,
            agent_llm_id=-1,
            provider_model="openai/gpt-5",
            created_at="2026-05-11T21-00-00Z",
        ),
    }
    for suite_name, suite_state in suite_states.items():
        set_suite_state(cfg, suite_name, suite_state)

    medical = get_suite_state(cfg, "medical")
    assert medical.search_space_id == 1
    assert medical.provider_model == "anthropic/claude-sonnet-4.5"

    # Tearing down "medical" must not disturb "legal".
    was_cleared = clear_suite_state(cfg, "medical")
    assert was_cleared is True
    assert get_suite_state(cfg, "medical") is None
    assert get_suite_state(cfg, "legal").search_space_id == 2

    # The file on disk reflects the same picture.
    on_disk = json.loads(cfg.state_path.read_text(encoding="utf-8"))
    stored_suites = on_disk["suites"]
    assert "medical" not in stored_suites
    assert "legal" in stored_suites
|
||||
|
||||
|
||||
def test_paths_are_per_suite(tmp_env):  # noqa: ARG001
    """Each suite gets its own data dir; reports/runs/maps dirs are nested as expected."""
    cfg = load_config()

    medical_data = cfg.suite_data_dir("medical")
    legal_data = cfg.suite_data_dir("legal")
    assert medical_data != legal_data

    assert cfg.suite_reports_dir("medical").parent == cfg.reports_dir
    assert cfg.suite_runs_dir("medical").name == "runs"
    assert cfg.suite_maps_dir("medical").name == "maps"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scenario state — back-compat + new fields
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_legacy_state_back_compat_defaults_to_head_to_head():
    """Pre-scenario ``state.json`` payloads must still load.

    A payload lacking ``scenario``, the ``vision_*`` fields and
    ``native_arm_model`` loads with the scenario defaulting to
    ``head-to-head`` and the remaining fields to ``None``, and the
    effective native-arm model equals ``provider_model``.
    """
    pre_scenario_payload = {
        "search_space_id": 7,
        "agent_llm_id": -123,
        "provider_model": "anthropic/claude-sonnet-4.5",
        "created_at": "2026-05-11T20-30-00Z",
        "ingestion_maps": {},
    }

    loaded = SuiteState.from_dict(pre_scenario_payload)

    assert loaded.scenario == DEFAULT_SCENARIO == "head-to-head"
    assert loaded.vision_llm_config_id is None
    assert loaded.vision_provider_model is None
    assert loaded.native_arm_model is None
    # With no explicit native-arm model, both arms share the same model slug.
    assert loaded.effective_native_arm_model == loaded.provider_model
|
||||
|
||||
|
||||
def test_unknown_scenario_falls_back_to_default():
    """An unrecognised ``scenario`` value loads as the default scenario.

    Defensive: a stale state file should load with the default scenario
    rather than make ``from_dict`` fail.
    """
    stale_payload = {
        "search_space_id": 1,
        "agent_llm_id": -1,
        "provider_model": "openai/gpt-5",
        "scenario": "unknown-scenario-name",
    }

    loaded = SuiteState.from_dict(stale_payload)

    assert loaded.scenario == DEFAULT_SCENARIO
|
||||
|
||||
|
||||
def test_cost_arbitrage_state_persists_native_arm_model(tmp_env):  # noqa: ARG001
    """Scenario, vision and native-arm fields survive a save/load round trip."""
    cfg = load_config()
    saved = SuiteState(
        search_space_id=42,
        agent_llm_id=-1,
        provider_model="openai/gpt-5.4-mini",
        created_at="2026-05-11T20-30-00Z",
        scenario="cost-arbitrage",
        vision_llm_config_id=-101,
        vision_provider_model="anthropic/claude-sonnet-4.5",
        native_arm_model="anthropic/claude-sonnet-4.5",
    )
    set_suite_state(cfg, "medical", saved)

    loaded = get_suite_state(cfg, "medical")
    assert loaded.scenario == "cost-arbitrage"
    assert loaded.vision_llm_config_id == -101
    assert loaded.vision_provider_model == "anthropic/claude-sonnet-4.5"
    assert loaded.native_arm_model == "anthropic/claude-sonnet-4.5"
    # In cost-arbitrage the native arm uses a different slug than SurfSense.
    assert loaded.effective_native_arm_model != loaded.provider_model
    assert loaded.effective_native_arm_model == "anthropic/claude-sonnet-4.5"

    on_disk = json.loads(cfg.state_path.read_text(encoding="utf-8"))
    assert on_disk["suites"]["medical"]["scenario"] == "cost-arbitrage"
|
||||
|
||||
|
||||
def test_scenario_constants_are_stable():
    """Pin the exact scenario tuple and default so renames are caught here."""
    expected_scenarios = ("head-to-head", "symmetric-cheap", "cost-arbitrage")

    assert SCENARIOS == expected_scenarios
    assert DEFAULT_SCENARIO == "head-to-head"
|
||||
269
surfsense_evals/tests/core/test_ingest_settings.py
Normal file
269
surfsense_evals/tests/core/test_ingest_settings.py
Normal file
|
|
@ -0,0 +1,269 @@
|
|||
"""Unit tests for ``surfsense_evals.core.ingest_settings``.
|
||||
|
||||
Covers:
|
||||
|
||||
* ``IngestSettings.merge`` honours operator overrides and falls back
|
||||
to per-benchmark defaults when the operator is silent.
|
||||
* ``add_ingest_settings_args`` exposes the three flag pairs and
|
||||
argparse defaults of ``None`` correctly distinguish "not passed"
|
||||
from "explicitly false".
|
||||
* ``settings_header_line`` / ``read_settings_header`` round-trip
|
||||
through a JSONL file.
|
||||
* ``read_settings_header`` is fault-tolerant: missing files, missing
|
||||
header, malformed JSON.
|
||||
* ``format_ingest_settings_md`` produces a stable Markdown bullet.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from surfsense_evals.core.ingest_settings import (
|
||||
PROCESSING_MODES,
|
||||
SETTINGS_HEADER_KEY,
|
||||
IngestSettings,
|
||||
add_ingest_settings_args,
|
||||
format_ingest_settings_md,
|
||||
is_settings_header,
|
||||
read_settings_header,
|
||||
settings_header_line,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# IngestSettings.merge
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMerge:
    """``IngestSettings.merge`` semantics, plus ``to_dict`` and ``render_label``."""

    def test_silent_operator_uses_defaults(self) -> None:
        baseline = IngestSettings(
            use_vision_llm=True, processing_mode="basic", should_summarize=True
        )
        assert IngestSettings.merge(baseline, {}) == baseline

    def test_explicit_false_overrides_default_true(self) -> None:
        baseline = IngestSettings(use_vision_llm=True)
        result = IngestSettings.merge(baseline, {"use_vision_llm": False})
        assert result.use_vision_llm is False

    def test_explicit_true_overrides_default_false(self) -> None:
        baseline = IngestSettings(use_vision_llm=False)
        result = IngestSettings.merge(baseline, {"use_vision_llm": True})
        assert result.use_vision_llm is True

    def test_none_means_silent(self) -> None:
        # ``None`` is what argparse yields when the operator passed neither
        # --use-vision-llm nor --no-vision-llm, so the default must win.
        baseline = IngestSettings(use_vision_llm=True)
        result = IngestSettings.merge(baseline, {"use_vision_llm": None})
        assert result.use_vision_llm is True

    def test_processing_mode_override(self) -> None:
        baseline = IngestSettings(processing_mode="basic")
        result = IngestSettings.merge(baseline, {"processing_mode": "premium"})
        assert result.processing_mode == "premium"

    def test_processing_mode_invalid_raises(self) -> None:
        baseline = IngestSettings(processing_mode="basic")
        with pytest.raises(ValueError, match="Invalid processing_mode"):
            IngestSettings.merge(baseline, {"processing_mode": "exotic"})

    def test_processing_mode_blank_falls_back(self) -> None:
        baseline = IngestSettings(processing_mode="basic")
        result = IngestSettings.merge(baseline, {"processing_mode": ""})
        assert result.processing_mode == "basic"

    def test_string_truthy_coerced(self) -> None:
        baseline = IngestSettings(use_vision_llm=False)
        result = IngestSettings.merge(baseline, {"use_vision_llm": "yes"})
        assert result.use_vision_llm is True

    def test_string_falsy_coerced(self) -> None:
        baseline = IngestSettings(use_vision_llm=True)
        result = IngestSettings.merge(baseline, {"use_vision_llm": "false"})
        assert result.use_vision_llm is False

    def test_other_keys_ignored(self) -> None:
        # Callers hand over their whole options dict, so keys unrelated to
        # ingest settings must be tolerated.
        baseline = IngestSettings(use_vision_llm=True, processing_mode="basic")
        full_opts = {
            "use_vision_llm": False,
            "concurrency": 4,
            "task_filter": "all",
            "no_mentions": True,
        }
        result = IngestSettings.merge(baseline, full_opts)
        assert result.use_vision_llm is False
        assert result.processing_mode == "basic"

    def test_to_dict_round_trips(self) -> None:
        settings = IngestSettings(
            use_vision_llm=True, processing_mode="premium", should_summarize=False
        )
        expected = {
            "use_vision_llm": True,
            "processing_mode": "premium",
            "should_summarize": False,
        }
        assert settings.to_dict() == expected

    def test_render_label_format(self) -> None:
        settings = IngestSettings(
            use_vision_llm=True, processing_mode="premium", should_summarize=True
        )
        label = settings.render_label()
        assert label == "vision=on, mode=premium, summarize=on"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# add_ingest_settings_args
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAddArgs:
    """CLI flags registered by ``add_ingest_settings_args``."""

    @pytest.fixture
    def parser(self) -> argparse.ArgumentParser:
        """Return a fresh parser with the ingest-settings flags attached."""
        all_off = IngestSettings(
            use_vision_llm=False, processing_mode="basic", should_summarize=False
        )
        arg_parser = argparse.ArgumentParser()
        add_ingest_settings_args(arg_parser, defaults=all_off)
        return arg_parser

    def test_silent_invocation_yields_none(self, parser: argparse.ArgumentParser) -> None:
        parsed = parser.parse_args([])
        assert parsed.use_vision_llm is None
        assert parsed.processing_mode is None
        assert parsed.should_summarize is None

    def test_use_vision_llm_flag(self, parser: argparse.ArgumentParser) -> None:
        parsed = parser.parse_args(["--use-vision-llm"])
        assert parsed.use_vision_llm is True

    def test_no_vision_llm_flag(self, parser: argparse.ArgumentParser) -> None:
        parsed = parser.parse_args(["--no-vision-llm"])
        assert parsed.use_vision_llm is False

    def test_processing_mode_choices(self, parser: argparse.ArgumentParser) -> None:
        for known_mode in PROCESSING_MODES:
            parsed = parser.parse_args(["--processing-mode", known_mode])
            assert parsed.processing_mode == known_mode

    def test_processing_mode_rejects_unknown(
        self, parser: argparse.ArgumentParser
    ) -> None:
        # argparse reports a bad value by raising SystemExit.
        with pytest.raises(SystemExit):
            parser.parse_args(["--processing-mode", "exotic"])

    def test_summarize_flag_pair(self, parser: argparse.ArgumentParser) -> None:
        enabled = parser.parse_args(["--should-summarize"])
        assert enabled.should_summarize is True
        disabled = parser.parse_args(["--no-summarize"])
        assert disabled.should_summarize is False

    def test_vision_flags_mutually_exclusive(
        self, parser: argparse.ArgumentParser
    ) -> None:
        with pytest.raises(SystemExit):
            parser.parse_args(["--use-vision-llm", "--no-vision-llm"])

    def test_full_pipeline(self, parser: argparse.ArgumentParser) -> None:
        # Parse explicit flags, merge them over the defaults, and expect
        # exactly what was requested (unset fields keep their default).
        parsed = parser.parse_args(
            ["--use-vision-llm", "--processing-mode", "premium"]
        )
        baseline = IngestSettings(
            use_vision_llm=False, processing_mode="basic", should_summarize=False
        )
        expected = IngestSettings(
            use_vision_llm=True, processing_mode="premium", should_summarize=False
        )
        assert IngestSettings.merge(baseline, vars(parsed)) == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header round-trip + read_settings_header fault tolerance
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestHeader:
    """Settings-header round trip and ``read_settings_header`` fault tolerance."""

    def test_header_line_round_trip(self, tmp_path: Path) -> None:
        settings = IngestSettings(use_vision_llm=True, processing_mode="premium")
        map_file = tmp_path / "map.jsonl"
        record_line = json.dumps({"case_id": "x", "document_id": 1})
        map_file.write_text(
            settings_header_line(settings) + "\n" + record_line + "\n",
            encoding="utf-8",
        )
        assert read_settings_header(map_file) == settings.to_dict()

    def test_is_settings_header_recognises(self) -> None:
        assert is_settings_header({SETTINGS_HEADER_KEY: {}})
        assert not is_settings_header({"case_id": "x"})

    def test_missing_file_returns_empty(self, tmp_path: Path) -> None:
        absent = tmp_path / "does_not_exist.jsonl"
        assert read_settings_header(absent) == {}

    def test_empty_file_returns_empty(self, tmp_path: Path) -> None:
        empty_file = tmp_path / "empty.jsonl"
        empty_file.write_text("", encoding="utf-8")
        assert read_settings_header(empty_file) == {}

    def test_no_header_returns_empty(self, tmp_path: Path) -> None:
        # A map file that holds only data rows and no header line.
        legacy_file = tmp_path / "legacy.jsonl"
        rows = (
            {"case_id": "x", "document_id": 1},
            {"case_id": "y", "document_id": 2},
        )
        legacy_file.write_text(
            "".join(json.dumps(row) + "\n" for row in rows),
            encoding="utf-8",
        )
        assert read_settings_header(legacy_file) == {}

    def test_malformed_json_returns_empty(self, tmp_path: Path) -> None:
        broken_file = tmp_path / "broken.jsonl"
        broken_file.write_text("not json\n", encoding="utf-8")
        assert read_settings_header(broken_file) == {}

    def test_skips_blank_first_lines(self, tmp_path: Path) -> None:
        settings = IngestSettings(use_vision_llm=True)
        padded_file = tmp_path / "padded.jsonl"
        # Two blank lines precede the header line.
        padded_file.write_text(
            "\n\n" + settings_header_line(settings) + "\n",
            encoding="utf-8",
        )
        assert read_settings_header(padded_file) == settings.to_dict()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# format_ingest_settings_md
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestFormatMd:
    """Markdown rendering of recorded ingest settings."""

    def test_full_settings(self) -> None:
        recorded = {
            "use_vision_llm": True,
            "processing_mode": "premium",
            "should_summarize": True,
        }
        rendered = format_ingest_settings_md(recorded)
        assert "vision_llm=`on`" in rendered
        assert "processing_mode=`premium`" in rendered
        assert "summarize=`on`" in rendered

    def test_default_off(self) -> None:
        recorded = {
            "use_vision_llm": False,
            "processing_mode": "basic",
            "should_summarize": False,
        }
        rendered = format_ingest_settings_md(recorded)
        assert "vision_llm=`off`" in rendered
        assert "processing_mode=`basic`" in rendered
        assert "summarize=`off`" in rendered

    def test_missing_returns_re_ingest_hint(self) -> None:
        # ``None``, an empty dict and a non-mapping must all produce the
        # "not recorded" hint.
        for unusable in (None, {}, "not-a-mapping"):
            rendered = format_ingest_settings_md(unusable)
            assert "(not recorded" in rendered
|
||||
153
surfsense_evals/tests/core/test_metrics.py
Normal file
153
surfsense_evals/tests/core/test_metrics.py
Normal file
|
|
@ -0,0 +1,153 @@
|
|||
"""Metric correctness — Wilson, McNemar, retrieval scores."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
import pytest
|
||||
|
||||
from surfsense_evals.core.metrics import (
|
||||
accuracy_with_wilson_ci,
|
||||
bootstrap_delta_ci,
|
||||
mcnemar_test,
|
||||
mrr,
|
||||
ndcg_at_k,
|
||||
recall_at_k,
|
||||
score_run,
|
||||
wilson_ci,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Wilson
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "k,n,low,high",
    [
        # First row cross-checked vs statsmodels.proportion_confint(method='wilson').
        (80, 100, 0.7111, 0.8666),
        (50, 100, 0.4038, 0.5962),
        (0, 0, 0.0, 1.0),
        (0, 10, 0.0, 0.2775),
        (10, 10, 0.7225, 1.0),
    ],
)
def test_wilson_ci_known_values(k, n, low, high):
    """``wilson_ci`` matches the reference bounds to within 5e-4."""
    tolerance = 5e-4
    bounds = wilson_ci(k, n)
    observed_low, observed_high = bounds

    assert math.isclose(observed_low, low, abs_tol=tolerance), (k, n, observed_low, low)
    assert math.isclose(observed_high, high, abs_tol=tolerance), (k, n, observed_high, high)
|
||||
|
||||
|
||||
def test_accuracy_with_wilson_ci_object():
    """``accuracy_with_wilson_ci`` reports the accuracy with an interval inside (0, 1)."""
    res = accuracy_with_wilson_ci(70, 100)

    # Compare with a tolerance, as the other float checks in this module do,
    # so the test does not hinge on how the ratio happens to be computed.
    assert math.isclose(res.accuracy, 0.7, rel_tol=1e-9)
    assert 0.0 < res.ci_low < res.ci_high < 1.0
|
||||
|
||||
|
||||
def test_invalid_inputs_raise():
    """A negative count and a count above ``n`` are both rejected."""
    for bad_successes in (-1, 11):
        with pytest.raises(ValueError):
            accuracy_with_wilson_ci(bad_successes, 10)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# McNemar
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_mcnemar_degenerate_returns_p_value_one():
    """Identical outcome vectors have no discordant pairs."""
    outcomes = [True, True, False, False]
    res = mcnemar_test(outcomes, list(outcomes))
    assert (res.b, res.c) == (0, 0)
    assert res.p_value == 1.0
    assert res.method == "degenerate"
|
||||
|
||||
|
||||
def test_mcnemar_exact_branch_strong_signal():
    """B = 0, C = 10 → exact two-sided binomial p == 2 * (1/2)**10."""

    first_arm = [True] * 10 + [False] * 10
    second_arm = [True] * 20  # surfsense beats native on the 10 native-wrong
    res = mcnemar_test(first_arm, second_arm)

    assert res.b == 0
    assert res.c == 10
    assert res.method == "exact"

    expected_p = 2 * 0.5 ** 10
    assert math.isclose(res.p_value, expected_p, rel_tol=1e-9)
|
||||
|
||||
|
||||
def test_mcnemar_chi_square_approx_for_large_discordant():
    """b=15, c=5 is expected to take the continuity-corrected chi-square path."""
    # Concordant tail shared by both arms: 30 both-right, 30 both-wrong.
    concordant = [True] * 30 + [False] * 30
    # Discordant head: 15 pairs one way, 5 the other, so the
    # continuity-corrected chi^2 is (|10|-1)^2/20 = 4.05.
    a = [True] * 15 + [False] * 5 + concordant
    b = [False] * 15 + [True] * 5 + concordant

    res = mcnemar_test(a, b)

    assert res.method == "chi2_cc"
    assert (res.b, res.c) == (15, 5)
    expected_statistic = ((abs(15 - 5) - 1) ** 2) / 20.0
    assert math.isclose(res.statistic, expected_statistic, rel_tol=1e-9)
    # p ≈ chi2.sf(4.05, df=1) ≈ 0.04417
    assert 0.04 < res.p_value < 0.05
|
||||
|
||||
|
||||
def test_mcnemar_length_mismatch():
    """Outcome vectors of different lengths are rejected with ``ValueError``."""
    one_item, two_items = [True], [True, False]
    with pytest.raises(ValueError):
        mcnemar_test(one_item, two_items)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bootstrap
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_bootstrap_delta_ci_shape_and_determinism():
    """Same seed gives the same result; the CI brackets the point estimate."""
    a = [True, True, False, True, False, False, True, True]
    b = [True, True, True, True, True, False, True, False]

    # Two independent runs with an identical seed.
    first = bootstrap_delta_ci(a, b, n_resamples=500, random_state=42)
    second = bootstrap_delta_ci(a, b, n_resamples=500, random_state=42)

    for field in ("delta", "ci_low", "ci_high"):
        assert getattr(first, field) == getattr(second, field)

    assert first.ci_low <= first.delta <= first.ci_high
    assert first.n_resamples == 500
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Retrieval
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_recall_at_k():
    """Recall counts relevant ids found in the top-k over all relevant ids."""
    retrieved = ["a", "b", "c", "d"]
    relevant = ["b", "d", "z"]
    # (cutoff, number of relevant ids expected within that cutoff)
    for cutoff, hits in ((2, 1), (4, 2)):
        assert recall_at_k(retrieved, relevant, k=cutoff) == pytest.approx(hits / 3)
|
||||
|
||||
|
||||
def test_mrr():
    """Reciprocal rank of the first relevant hit; 0.0 when nothing matches."""
    hit_at_rank_three = mrr(["a", "b", "c"], ["c"])
    assert hit_at_rank_three == pytest.approx(1 / 3)

    no_hit = mrr(["x", "y"], ["z"])
    assert no_hit == 0.0
|
||||
|
||||
|
||||
def test_ndcg_at_k_perfect_order():
    """Ranking documents in descending gain order scores 1.0."""
    graded = {"a": 2, "b": 1}
    ideal_ranking = ["a", "b"]
    assert ndcg_at_k(ideal_ranking, graded, k=2) == pytest.approx(1.0)
|
||||
|
||||
|
||||
def test_ndcg_at_k_irrelevant_first():
    """An unjudged document in first place lowers nDCG without zeroing it."""
    graded = {"a": 2, "b": 1}
    ranking = ["c", "a", "b"]
    # Wrong order should still be > 0 but < 1
    score = ndcg_at_k(ranking, graded, k=3)
    assert 0 < score < 1
|
||||
|
||||
|
||||
def test_score_run_aggregates_across_queries():
    """Per-query recall and MRR are averaged over the two queries."""
    retrieved_by_query = {"q1": ["a", "b"], "q2": ["x", "y", "z"]}
    qrels_by_query = {"q1": {"a": 1}, "q2": {"z": 2}}

    scores = score_run(
        per_query_retrieved=retrieved_by_query,
        per_query_qrels=qrels_by_query,
        ks=(1, 5),
        ndcg_k=5,
    )

    assert scores.n_queries == 2
    # q1 hits @1, q2 doesn't
    assert scores.recall_at_k[1] == pytest.approx((1 + 0) / 2)
    # q1's relevant doc is at rank 1, q2's at rank 3.
    assert scores.mrr == pytest.approx((1.0 + 1 / 3) / 2)
|
||||
27
surfsense_evals/tests/core/test_parse_answer_letter.py
Normal file
27
surfsense_evals/tests/core/test_parse_answer_letter.py
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
"""Tests for the MCQ answer-letter extractor."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from surfsense_evals.core.parse import extract_answer_letter
|
||||
from surfsense_evals.core.parse.answer_letter import AnswerLetterResult
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("text", "expected_letter", "expected_strategy"),
    [
        # JSON envelope, fenced and inline.
        ('```json\n{"step_by_step_thinking": "...", "answer_choice": "B"}\n```', "B", "json_envelope"),
        ('Reasoning... {"step_by_step_thinking": "x", "answer_choice": "C"}', "C", "json_envelope"),
        # Explicit answer phrasing.
        ("Long reasoning.\nAnswer: D", "D", "answer_line"),
        ("The correct answer is (A).", "A", "answer_line"),
        ("Final answer: e", "E", "answer_line"),
        # A lone letter on the final line.
        ("Long reasoning.\n\nB", "B", "bare_letter"),
        ("Long reasoning.\n\n(C).", "C", "bare_letter"),
        # Nothing extractable.
        ("", None, "none"),
        ("Just narrative without an answer.", None, "none"),
    ],
)
def test_extract_answer_letter(text, expected_letter, expected_strategy):
    """Each input yields the expected letter and extraction strategy."""
    actual = extract_answer_letter(text)
    wanted = AnswerLetterResult(expected_letter, expected_strategy)
    assert actual == wanted
|
||||
108
surfsense_evals/tests/core/test_parse_citations.py
Normal file
108
surfsense_evals/tests/core/test_parse_citations.py
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
"""Parity tests for the citation regex.
|
||||
|
||||
Each row mirrors a case from the canonical TS reference at
|
||||
``surfsense_web/lib/citations/citation-parser.ts``. If a future PR
|
||||
loosens or tightens the TS regex, these tests will start failing;
|
||||
that's the explicit signal to re-port the change.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from surfsense_evals.core.parse import (
|
||||
CITATION_REGEX,
|
||||
ChunkCitation,
|
||||
UrlCitation,
|
||||
parse_citations,
|
||||
)
|
||||
|
||||
# Rows: (input, expected number of matches, expected first-token kind/value).
# ``None`` in the last slot means "no first token to compare".
PARITY_TABLE = [
    ("Plain text with no citation.", 0, None),
    ("The patient has fever [citation:42] and cough.", 1, ChunkCitation(chunk_id=42, is_docs_chunk=False)),
    ("Negative chunk ids work [citation:-7].", 1, ChunkCitation(chunk_id=-7, is_docs_chunk=False)),
    ("doc-prefix [citation:doc-12].", 1, ChunkCitation(chunk_id=12, is_docs_chunk=True)),
    ("Multi id [citation:1, doc-2, -3].", 3, ChunkCitation(chunk_id=1, is_docs_chunk=False)),
    ("URL form [citation:https://x.com/a].", 1, UrlCitation(url="https://x.com/a")),
    ("Chinese brackets【citation:5】.", 1, ChunkCitation(chunk_id=5, is_docs_chunk=False)),
    ("ZWSP-decorated [\u200bcitation:9\u200b].", 1, ChunkCitation(chunk_id=9, is_docs_chunk=False)),
    ("Whitespace [citation: doc-100 ] tolerated.", 1, ChunkCitation(chunk_id=100, is_docs_chunk=True)),
    # The TS regex's URL char class excludes ']', so a trailing
    # bracket isn't swallowed.
    ("Two URLs [citation:https://a.io] and [citation:https://b.io].", 2, UrlCitation(url="https://a.io")),
    # Garbled form should match nothing.
    ("Citation-like but wrong [citation:].", 0, None),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,n_expected,first", PARITY_TABLE)
def test_citation_regex_parity(text: str, n_expected: int, first):
    """Check the token count and, when one is given, the first parsed token."""
    parsed = parse_citations(text)
    assert len(parsed) == n_expected, (text, parsed)
    if first is None:
        # Rows without an expected first token only pin the count.
        return
    assert parsed[0] == first, (text, parsed)
|
||||
|
||||
|
||||
def test_regex_pattern_matches_ts_source():
    """Sanity: the compiled pattern carries the exact alternatives the TS source does."""

    pattern = CITATION_REGEX.pattern
    # Each fragment must appear verbatim in the regex source text.
    required_fragments = ("https?://", "urlcite", "doc-", "\u200B", "【", "】")
    for fragment in required_fragments:
        assert fragment in pattern
|
||||
|
||||
|
||||
def test_url_map_resolution():
    """A ``urlcite`` placeholder is replaced by its ``url_map`` entry."""
    lookup = {"urlcite0": "https://resolved.example/x"}
    tokens = parse_citations("Inline placeholder [citation:urlcite0].", url_map=lookup)
    assert tokens == [UrlCitation(url="https://resolved.example/x")]
|
||||
|
||||
|
||||
def test_url_map_missing_key_drops_token():
    """Missing urlcite resolution returns no token (TS behaviour)."""

    unresolved = parse_citations("[citation:urlcite99]", url_map={})
    assert unresolved == []
|
||||
73
surfsense_evals/tests/core/test_parse_freeform_answer.py
Normal file
73
surfsense_evals/tests/core/test_parse_freeform_answer.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
"""Tests for ``surfsense_evals.core.parse.freeform_answer``."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from surfsense_evals.core.parse.freeform_answer import extract_freeform_answer
|
||||
|
||||
|
||||
class TestExtractFreeformAnswer:
    """Behavioural checks for ``extract_freeform_answer``."""

    def test_empty_string_returns_empty(self) -> None:
        """Empty and whitespace-only input both yield the empty string."""
        for blank in ("", " \n\n "):
            assert extract_freeform_answer(blank) == ""

    def test_simple_answer_marker(self) -> None:
        """The text after ``Answer:`` is returned."""
        extracted = extract_freeform_answer("Answer: 42")
        assert extracted == "42"

    def test_final_answer_marker(self) -> None:
        """``Final answer:`` is recognised as a marker."""
        extracted = extract_freeform_answer("Final answer: Paris")
        assert extracted == "Paris"

    def test_the_answer_is_marker(self) -> None:
        """``The answer is:`` is recognised as a marker."""
        extracted = extract_freeform_answer("The answer is: not answerable")
        assert extracted == "not answerable"

    def test_multiline_picks_last_answer_marker(self) -> None:
        """With several marker lines, the last one wins."""
        reply = "Let me think...\nAnswer: 5\nAnswer: 7\n"
        assert extract_freeform_answer(reply) == "7"

    def test_falls_back_to_last_nonempty_line(self) -> None:
        """Without any marker, the last non-empty line is used."""
        reply = "Some thinking here.\n\n42"
        assert extract_freeform_answer(reply) == "42"

    def test_strips_quotes(self) -> None:
        """Surrounding double or single quotes are removed."""
        for quoted in ('Answer: "Paris"', "Answer: 'Paris'"):
            assert extract_freeform_answer(quoted) == "Paris"

    def test_strips_backticks(self) -> None:
        """Surrounding backticks are removed."""
        extracted = extract_freeform_answer("Answer: `42`")
        assert extracted == "42"

    def test_uses_fenced_block_when_no_marker(self) -> None:
        """A fenced block's content is returned when no marker is present."""
        reply = "Here's my response:\n```\nfinal value\n```\n"
        assert extract_freeform_answer(reply) == "final value"

    def test_case_insensitive_markers(self) -> None:
        """Marker matching ignores letter case."""
        cases = (("ANSWER: yes", "yes"), ("answer: no", "no"))
        for reply, wanted in cases:
            assert extract_freeform_answer(reply) == wanted

    @pytest.mark.parametrize(
        ("text", "expected"),
        [
            ("Answer: 1, 2, 3", "1, 2, 3"),
            ("Answer: 3.14", "3.14"),
            ("Answer: spaced ", "spaced"),
        ],
    )
    def test_various_payloads(self, text: str, expected: str) -> None:
        """Commas, decimals and trailing whitespace in the payload are handled."""
        extracted = extract_freeform_answer(text)
        assert extracted == expected

    def test_inline_answer_after_thinking_trace(self) -> None:
        """A marker glued onto the end of a thinking line is still found."""
        # Agent replies sometimes glue their thinking onto the same
        # line as the final "Answer: ..." marker (no newline before it).
        # The line-anchored regex misses this; the inline fallback
        # should still extract the right value.
        reply = (
            "Need the Charlotte Bronte book title/year and the rank "
            "for a 128-foot NYC building.Answer: 128th"
        )
        assert extract_freeform_answer(reply) == "128th"

    def test_inline_picks_last_inline_answer(self) -> None:
        """With several inline markers, the final one is used."""
        reply = "Thought: maybe Answer: 5 is right? Actually Answer: 7."
        assert extract_freeform_answer(reply) == "7."

    def test_inline_does_not_override_proper_marker(self) -> None:
        """A line-anchored marker takes precedence over an inline one."""
        # When a clean line-anchored "Answer: ..." exists, that wins.
        reply = "Some preamble.Answer: 99\nAnswer: 42"
        assert extract_freeform_answer(reply) == "42"
|
||||
84
surfsense_evals/tests/core/test_parse_sse.py
Normal file
84
surfsense_evals/tests/core/test_parse_sse.py
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
"""Tests for the SSE consumer."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from surfsense_evals.core.parse import iter_sse_events
|
||||
|
||||
|
||||
async def _alist(it):
    """Drain the async iterable ``it`` and return its items as a list.

    Items keep the order in which ``it`` yields them; an exhausted or
    empty iterable gives ``[]``.
    """
    # An async comprehension replaces the manual append loop.
    return [item async for item in it]
|
||||
|
||||
|
||||
async def _astream(lines):
    """Async generator that yields each element of ``lines`` in order."""
    pending = iter(lines)
    for entry in pending:
        yield entry
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_basic_data_frame():
    """Two blank-line-terminated ``data:`` frames yield two events in order."""
    wire_lines = [
        'data: {"type": "text-delta", "delta": "hi"}',
        "",
        'data: {"type": "finish"}',
        "",
    ]
    events = await _alist(iter_sse_events(_astream(wire_lines)))
    payloads = [event.data for event in events]
    assert payloads == [
        '{"type": "text-delta", "delta": "hi"}',
        '{"type": "finish"}',
    ]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_done_sentinel_passes_through():
    """The ``[DONE]`` sentinel is emitted as ordinary event data."""
    wire_lines = ["data: [DONE]", ""]
    events = await _alist(iter_sse_events(_astream(wire_lines)))
    payloads = [event.data for event in events]
    assert payloads == ["[DONE]"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multiline_data_joins_with_newline():
    """Consecutive ``data:`` lines in one frame are joined with a newline."""
    wire_lines = ["data: line1", "data: line2", ""]
    events = await _alist(iter_sse_events(_astream(wire_lines)))
    first_event = events[0]
    assert first_event.data == "line1\nline2"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_comments_and_other_fields_ignored():
    """Comment, ``event:`` and ``id:`` lines do not contribute to the data."""
    wire_lines = [
        ": heartbeat",
        "event: foo",
        "id: 123",
        "data: payload",
        "",
    ]
    events = await _alist(iter_sse_events(_astream(wire_lines)))
    payloads = [event.data for event in events]
    assert payloads == ["payload"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_handles_missing_trailing_blank():
    """Some servers omit the final blank line; the consumer should still emit."""

    wire_lines = ["data: only-one"]
    events = await _alist(iter_sse_events(_astream(wire_lines)))
    payloads = [event.data for event in events]
    assert payloads == ["only-one"]
|
||||
51
surfsense_evals/tests/core/test_pdf_render.py
Normal file
51
surfsense_evals/tests/core/test_pdf_render.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
"""Smoke tests for PDF rendering.
|
||||
|
||||
We don't pull a full PDF parser into the test deps; the assertions
|
||||
are bytes-level (``%PDF`` magic, deterministic CreationDate scrub).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from surfsense_evals.core.pdf import render_pdf, render_text_files_to_pdf
|
||||
|
||||
|
||||
def test_render_pdf_writes_pdf_with_magic(tmp_path: Path):
    """``render_pdf`` writes a file that starts with the ``%PDF-`` magic."""
    destination = tmp_path / "out.pdf"
    body_sections = [("intro", "Hello world."), ("body", "Line one.\nLine two.")]

    rendered = render_pdf(title="Test", sections=body_sections, output_path=destination)

    assert rendered.path == destination
    assert destination.exists()
    header = destination.read_bytes()
    assert header.startswith(b"%PDF-")
|
||||
|
||||
|
||||
def test_render_pdf_deterministic_dates(tmp_path: Path):
    """Rendering identical input twice produces byte-identical files."""
    sections = [("only", "deterministic body content")]
    targets = (tmp_path / "a.pdf", tmp_path / "b.pdf")
    for target in targets:
        render_pdf(title="Det", sections=sections, output_path=target)

    # CreationDate / ModDate are scrubbed to a fixed value, so the two
    # files should compare equal (modulo any other internal randomness
    # — reportlab's basic outputs are deterministic given fixed inputs).
    first_bytes, second_bytes = (target.read_bytes() for target in targets)
    assert first_bytes == second_bytes
|
||||
|
||||
|
||||
def test_render_text_files_uses_filename_as_section(tmp_path: Path):
    """Two text files are rendered into one PDF and counted in ``n_chars``."""
    source_dir = tmp_path / "src"
    source_dir.mkdir()

    admission = source_dir / "admission_note.txt"
    labs = source_dir / "labs.txt"
    admission.write_text("history of present illness", encoding="utf-8")
    labs.write_text("Na 138, K 4.0", encoding="utf-8")

    destination = tmp_path / "case.pdf"
    rendered = render_text_files_to_pdf(
        title="Case 1",
        files=[admission, labs],
        output_path=destination,
    )

    assert destination.exists()
    # We don't decode the PDF; the n_chars estimate should reflect both inputs.
    minimum_chars = len("history of present illness") + len("Na 138, K 4.0")
    assert rendered.n_chars >= minimum_chars
|
||||
73
surfsense_evals/tests/core/test_pdf_render_with_images.py
Normal file
73
surfsense_evals/tests/core/test_pdf_render_with_images.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
"""Tests for ``render_pdf_with_images`` — covers image embedding +
|
||||
deterministic byte output, mirroring ``test_pdf_render.py`` for the
|
||||
text-only path.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from surfsense_evals.core.pdf import PdfImage, render_pdf_with_images
|
||||
|
||||
|
||||
@pytest.fixture
def tiny_png(tmp_path: Path) -> Path:
    """Write a real 4x4 grey PNG with Pillow and return its path.

    Hand-crafted PNG headers tend to fail PIL's strict decoder, so the
    image is produced by Pillow itself, which is already a transitive
    dep of reportlab.
    """

    from PIL import Image as PILImage

    target = tmp_path / "pixel.png"
    grey_square = PILImage.new("RGB", (4, 4), color=(128, 128, 128))
    grey_square.save(target, format="PNG")
    return target
|
||||
|
||||
|
||||
class TestRenderPdfWithImages:
    """Smoke checks for ``render_pdf_with_images``."""

    def test_renders_pdf_with_no_images(self, tmp_path: Path) -> None:
        """A section whose image slot is ``None`` still renders a PDF."""
        destination = tmp_path / "out.pdf"
        text_only = [("Heading", "Body text here.", None)]

        rendered = render_pdf_with_images(
            title="Test", sections=text_only, output_path=destination
        )

        assert rendered.path == destination
        assert destination.exists()
        assert destination.read_bytes().startswith(b"%PDF-")

    def test_renders_pdf_with_one_image(self, tmp_path: Path, tiny_png: Path) -> None:
        """A section carrying one image produces a non-trivial file."""
        destination = tmp_path / "out.pdf"
        picture = PdfImage(path=tiny_png, caption="A pixel")

        render_pdf_with_images(
            title="Test",
            sections=[("Case", "Body text.", [picture])],
            output_path=destination,
        )

        assert destination.exists()
        size_in_bytes = destination.stat().st_size
        assert size_in_bytes > 200  # not empty

    def test_deterministic_bytes(self, tmp_path: Path, tiny_png: Path) -> None:
        """Rendering the same sections twice gives byte-identical output."""
        sections = [
            ("Case", "Some text.", [PdfImage(path=tiny_png, caption="cap")]),
            ("Options", "A) one\nB) two", None),
        ]
        targets = (tmp_path / "a.pdf", tmp_path / "b.pdf")
        for target in targets:
            render_pdf_with_images(title="Test", sections=sections, output_path=target)

        first_bytes, second_bytes = (target.read_bytes() for target in targets)
        assert first_bytes == second_bytes

    def test_skips_invalid_image_silently(self, tmp_path: Path) -> None:
        """A bad image path should not abort the whole PDF render."""

        destination = tmp_path / "out.pdf"
        missing_picture = PdfImage(path=tmp_path / "nope.jpg", caption="x")

        render_pdf_with_images(
            title="Test",
            sections=[("Case", "Text", [missing_picture])],
            output_path=destination,
        )

        assert destination.exists()
        assert destination.read_bytes().startswith(b"%PDF-")
|
||||
121
surfsense_evals/tests/core/test_provider_openrouter.py
Normal file
121
surfsense_evals/tests/core/test_provider_openrouter.py
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
"""respx-mocked tests for the OpenRouter PDF provider."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import respx
|
||||
|
||||
from surfsense_evals.core.providers.openrouter_pdf import (
|
||||
OpenRouterPdfProvider,
|
||||
PdfEngine,
|
||||
)
|
||||
|
||||
_BASE = "https://openrouter.test"
|
||||
|
||||
|
||||
@pytest.fixture
def tiny_pdf(tmp_path: Path) -> Path:
    """Write a small placeholder file with a PDF header and return its path."""
    target = tmp_path / "case.pdf"
    target.write_bytes(b"%PDF-1.4 minimal content")
    return target
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_payload_shape_matches_openrouter_docs(respx_mock, tiny_pdf: Path):
    """Inspect the outgoing chat-completions request and the parsed reply."""
    seen = {}

    def _record(request):
        # Stash the decoded body and headers so they can be asserted on
        # once the provider call has returned.
        seen["body"] = json.loads(request.content)
        seen["headers"] = dict(request.headers)
        canned_reply = {
            "choices": [{
                "message": {"content": "Answer: B"},
                "finish_reason": "stop",
            }],
            "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15, "cost": 0.0001},
        }
        return httpx.Response(200, json=canned_reply)

    respx_mock.post("/chat/completions").mock(side_effect=_record)

    provider = OpenRouterPdfProvider(
        api_key="sk-or-test",
        base_url=_BASE,
        model="anthropic/claude-sonnet-4.5",
        engine=PdfEngine.NATIVE,
    )
    response = await provider.complete(prompt="What is the diagnosis?", pdf_path=tiny_pdf)

    # --- request body -----------------------------------------------------
    payload = seen["body"]
    assert payload["model"] == "anthropic/claude-sonnet-4.5"
    assert payload["plugins"] == [{"id": "file-parser", "pdf": {"engine": "native"}}]

    user_message = payload["messages"][-1]
    assert user_message["role"] == "user"

    file_part = user_message["content"][0]
    assert file_part["type"] == "file"
    assert file_part["file"]["filename"] == tiny_pdf.name
    data_url = file_part["file"]["file_data"]
    assert data_url.startswith("data:application/pdf;base64,")
    # Everything after the first comma is the base64 payload.
    decoded = base64.b64decode(data_url.split(",", 1)[1])
    on_disk = tiny_pdf.read_bytes()  # noqa: ASYNC240 — test fixture, sync read is fine
    assert decoded == on_disk

    assert user_message["content"][1] == {"type": "text", "text": "What is the diagnosis?"}

    # --- request headers --------------------------------------------------
    sent_headers = seen["headers"]
    assert sent_headers["authorization"] == "Bearer sk-or-test"
    assert sent_headers.get("x-title") == "SurfSense-evals"

    # --- parsed response --------------------------------------------------
    assert response.text == "Answer: B"
    assert response.input_tokens == 10
    assert response.output_tokens == 5
    assert response.total_tokens == 15
    # cost 0.0001 USD == 100 micros
    assert response.cost_micros == 100
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_chat_array_content_concatenates(respx_mock, tiny_pdf: Path):
    """Array-form message content yields the joined text parts only."""
    content_parts = [
        {"type": "text", "text": "Hello "},
        {"type": "text", "text": "world"},
        {"type": "image_url", "image_url": "ignored"},
    ]
    canned = httpx.Response(
        200,
        json={
            "choices": [{"message": {"content": content_parts}}],
            "usage": {"prompt_tokens": 1, "completion_tokens": 1},
        },
    )
    respx_mock.post("/chat/completions").mock(return_value=canned)

    provider = OpenRouterPdfProvider(api_key="sk-or-test", base_url=_BASE, model="x/y")
    response = await provider.complete(prompt="hi", pdf_path=tiny_pdf)

    assert response.text == "Hello world"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@respx.mock(base_url=_BASE)
async def test_provider_raises_on_4xx(respx_mock, tiny_pdf: Path):
    """A 429 reply surfaces as ``httpx.HTTPStatusError``."""
    rate_limited = httpx.Response(429, json={"error": {"message": "rate limited"}})
    respx_mock.post("/chat/completions").mock(return_value=rate_limited)

    provider = OpenRouterPdfProvider(api_key="sk-or-test", base_url=_BASE, model="x/y")
    with pytest.raises(httpx.HTTPStatusError):
        await provider.complete(prompt="hi", pdf_path=tiny_pdf)
|
||||
|
||||
|
||||
def test_missing_api_key_raises():
    """Constructing the provider with an empty API key raises ``ValueError``."""
    empty_key = ""
    with pytest.raises(ValueError):
        OpenRouterPdfProvider(api_key=empty_key, base_url=_BASE, model="x/y")
|
||||
58
surfsense_evals/tests/core/test_registry.py
Normal file
58
surfsense_evals/tests/core/test_registry.py
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
"""Registry + auto-discovery tests.
|
||||
|
||||
* Auto-discovery skips packages starting with ``_`` (so test fixtures
|
||||
don't leak into the production catalogue).
|
||||
* Manually importing a ``_demo`` benchmark fires its ``register(...)``
|
||||
call and the CLI sees it.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
|
||||
from surfsense_evals.core import registry
|
||||
|
||||
|
||||
def _force_register_demo() -> None:
    """Make sure the demo benchmark's ``register(...)`` call has run.

    The first ``import_module`` in an interpreter executes the module
    body. Later calls hit the ``sys.modules`` cache and do nothing, so
    when an earlier test has already unregistered the entry the module
    must be reloaded to re-execute its body.
    """

    demo_module = importlib.import_module("surfsense_evals.suites._demo.hello")
    if ("_demo", "hello") in registry.snapshot():
        # Still registered: nothing more to do.
        return
    importlib.reload(demo_module)
|
||||
|
||||
|
||||
def test_auto_discovery_skips_underscore_prefixed_subpackages():
    """No discovered dotted name contains an underscore-prefixed segment."""
    from surfsense_evals.suites import discover_suites

    discovered = discover_suites()
    for dotted_name in discovered:
        for segment in dotted_name.split("."):
            assert not segment.startswith("_")
    # The medical suite's headline benchmark must always discover.
    assert any(name.endswith(".medical.medxpertqa") for name in discovered)
|
||||
|
||||
|
||||
def test_demo_benchmark_registers_on_explicit_import():
    """Importing the demo module registers ``("_demo", "hello")``.

    The registered benchmark must be retrievable, carry the name
    ``"hello"`` and have ``headline`` set to ``False``.
    """
    _force_register_demo()
    try:
        bench = registry.get("_demo", "hello")
        assert bench is not None
        assert bench.name == "hello"
        assert bench.headline is False
    finally:
        # Cleanup so the test is idempotent under repeated runs. Doing it
        # in ``finally`` keeps a failed assertion from leaking the demo
        # entry into later tests; the membership guard avoids calling
        # ``unregister`` for an entry that never got registered.
        if ("_demo", "hello") in registry.snapshot():
            registry.unregister("_demo", "hello")
|
||||
|
||||
|
||||
def test_register_unregister_roundtrip():
    """Registering then unregistering restores the registry snapshot."""
    demo_key = ("_demo", "hello")

    # Make sure no stale entry from a prior test in the session.
    if demo_key in registry.snapshot():
        registry.unregister(*demo_key)
    snapshot_before = dict(registry.snapshot())

    _force_register_demo()
    try:
        assert demo_key in registry.snapshot()
    finally:
        # Always undo the registration, even when the assertion above
        # fails, so the demo entry cannot leak into other tests. The
        # guard skips the call if registration never happened.
        if demo_key in registry.snapshot():
            registry.unregister(*demo_key)

    assert dict(registry.snapshot()) == snapshot_before
|
||||
68
surfsense_evals/tests/core/test_scenarios.py
Normal file
68
surfsense_evals/tests/core/test_scenarios.py
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
"""Tests for the shared scenario formatter used in head-to-head reports."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from surfsense_evals.core.scenarios import format_scenario_md
|
||||
|
||||
|
||||
def test_head_to_head_renders_both_arms_same_slug():
    """The rendered line names the scenario and the provider model slug."""
    line = format_scenario_md(
        {"scenario": "head-to-head", "provider_model": "anthropic/claude-sonnet-4.5"}
    )
    for needle in ("head-to-head", "anthropic/claude-sonnet-4.5"):
        assert needle in line
|
||||
|
||||
|
||||
def test_head_to_head_includes_vision_slug_when_recorded():
    """A recorded vision model adds an "ingest VLM" mention to the line."""
    recorded = dict(
        scenario="head-to-head",
        provider_model="anthropic/claude-sonnet-4.5",
        vision_provider_model="anthropic/claude-sonnet-4.5",
    )
    line = format_scenario_md(recorded)
    for needle in ("ingest VLM", "claude-sonnet-4.5"):
        assert needle in line
|
||||
|
||||
|
||||
def test_symmetric_cheap_calls_out_native_arm_disadvantage():
    """The symmetric-cheap line carries the "structurally loses" disclaimer."""
    recorded = dict(
        scenario="symmetric-cheap",
        provider_model="openai/gpt-5.4-mini",
        vision_provider_model="anthropic/claude-sonnet-4.5",
    )
    line = format_scenario_md(recorded)

    assert "**symmetric-cheap**" in line
    assert "gpt-5.4-mini" in line
    # The "structurally loses" disclaimer must be there so reviewers
    # don't read this as a fair comparison.
    lowered = line.lower()
    assert any(marker in lowered for marker in ("structurally loses", "structurally_loses"))
|
||||
|
||||
|
||||
def test_cost_arbitrage_distinguishes_native_and_surfsense_slugs():
    """The cost-arbitrage line shows both model slugs and the cost note."""
    recorded = dict(
        scenario="cost-arbitrage",
        provider_model="openai/gpt-5.4-mini",
        native_arm_model="anthropic/claude-sonnet-4.5",
        vision_provider_model="anthropic/claude-sonnet-4.5",
    )
    line = format_scenario_md(recorded)

    assert "**cost-arbitrage**" in line
    # Both slugs surface; reader can see the asymmetry at a glance.
    for slug in ("anthropic/claude-sonnet-4.5", "openai/gpt-5.4-mini"):
        assert slug in line
    assert "fraction of the per-query cost" in line
|
||||
|
||||
|
||||
def test_legacy_artifact_without_scenario_renders_as_head_to_head():
    """Old run_artifact.json files don't have ``scenario`` — must still render."""

    legacy_extra = {"provider_model": "anthropic/claude-sonnet-4.5"}
    rendered = format_scenario_md(legacy_extra)
    assert "head-to-head" in rendered
|
||||
|
||||
|
||||
def test_none_extra_does_not_crash():
    """``None`` is accepted and renders as the head-to-head scenario."""
    rendered = format_scenario_md(None)
    assert "head-to-head" in rendered
|
||||
121
surfsense_evals/tests/core/test_vision_llm.py
Normal file
121
surfsense_evals/tests/core/test_vision_llm.py
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
"""Tests for vision LLM auto-pick + explicit-slug resolution."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from surfsense_evals.core.clients.search_space import VisionLlmConfigEntry
|
||||
from surfsense_evals.core.vision_llm import (
|
||||
RECOMMENDED_VISION_PRIORITY,
|
||||
VisionConfigError,
|
||||
resolve_vision_llm,
|
||||
)
|
||||
|
||||
|
||||
def _entry(*, id: int, model_name: str, provider: str = "OPENROUTER") -> VisionLlmConfigEntry:
    """Build a ``VisionLlmConfigEntry`` test double.

    Only ``id``, ``model_name`` and ``provider`` vary; the display name
    is derived from ``model_name``, auto mode is off and ``raw`` is empty.
    """
    fields = {
        "id": id,
        "name": f"OpenRouter • {model_name}",
        "provider": provider,
        "model_name": model_name,
        "is_auto_mode": False,
        "raw": {},
    }
    return VisionLlmConfigEntry(**fields)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Explicit slug resolution
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_explicit_slug_resolves_to_matching_config_id():
    """An explicit slug selects the candidate with that model name."""
    sonnet = _entry(id=-101, model_name="anthropic/claude-sonnet-4.5")
    gpt = _entry(id=-102, model_name="openai/gpt-5")

    resolved = resolve_vision_llm([sonnet, gpt], explicit_slug="openai/gpt-5")

    assert resolved.config_id == -102
    assert resolved.provider_model == "openai/gpt-5"
    assert resolved.selected_via == "explicit"
|
||||
|
||||
|
||||
def test_explicit_slug_with_no_match_raises_with_helpful_listing():
    """An unknown slug raises and the message names both it and a sample."""
    available = [_entry(id=-101, model_name="anthropic/claude-sonnet-4.5")]

    with pytest.raises(VisionConfigError) as exc_info:
        resolve_vision_llm(available, explicit_slug="some/missing-slug")

    message = str(exc_info.value)
    assert "some/missing-slug" in message
    # surfaced as a sample
    assert "anthropic/claude-sonnet-4.5" in message
|
||||
|
||||
|
||||
def test_explicit_slug_skips_non_openrouter_entries():
    """A YAML BYOK entry with a colliding model_name shouldn't accidentally match."""

    byok = _entry(id=42, model_name="openai/gpt-5", provider="OPENAI")
    openrouter = _entry(id=-101, model_name="openai/gpt-5")

    resolved = resolve_vision_llm([byok, openrouter], explicit_slug="openai/gpt-5")

    # the OpenRouter one, not the BYOK one
    assert resolved.config_id == -101
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Auto-pick by recommended priority
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_auto_pick_walks_priority_list_in_order():
    """Without an explicit slug, the highest-priority registered model is chosen."""
    # Deliberately listed in the reverse of the expected preference so that
    # candidate order alone cannot explain the result.
    registered = [
        _entry(id=model_id, model_name=slug)
        for model_id, slug in (
            (-300, "google/gemini-2.5-pro"),
            (-200, "anthropic/claude-opus-4.7"),
            (-100, "anthropic/claude-sonnet-4.5"),
        )
    ]

    picked = resolve_vision_llm(registered, explicit_slug=None)

    # claude-sonnet-4.5 is first in the priority tuple, so it wins.
    assert picked.config_id == -100
    assert picked.provider_model == "anthropic/claude-sonnet-4.5"
    assert picked.selected_via == "auto-priority"
|
||||
|
||||
|
||||
def test_auto_pick_skips_to_next_priority_when_first_unavailable():
    """When the top-priority model is absent, the next listed priority is used."""
    opus = _entry(id=-200, model_name="anthropic/claude-opus-4.7")
    gemini = _entry(id=-300, model_name="google/gemini-2.5-pro")

    picked = resolve_vision_llm([opus, gemini], explicit_slug=None)

    # claude-sonnet-4.5 not registered → claude-opus-4.7 is next in priority.
    assert picked.provider_model == "anthropic/claude-opus-4.7"
    assert picked.selected_via == "auto-priority"
|
||||
|
||||
|
||||
def test_auto_pick_falls_back_to_first_openrouter_when_no_recommended_match():
    """With no recommended model registered, the first candidate is the fallback."""
    first_exotic = _entry(id=-700, model_name="some/exotic-vision-model")
    second_exotic = _entry(id=-800, model_name="another/exotic-vision-model")

    picked = resolve_vision_llm([first_exotic, second_exotic], explicit_slug=None)

    # Neither matches the priority list → first OpenRouter entry wins.
    assert picked.config_id == -700
    assert picked.selected_via == "auto-fallback"
|
||||
|
||||
|
||||
def test_auto_pick_with_zero_openrouter_candidates_raises():
    """An empty candidate list raises an error that mentions ``vision_enabled: true``."""
    nothing_registered: list[VisionLlmConfigEntry] = []

    with pytest.raises(VisionConfigError) as caught:
        resolve_vision_llm(nothing_registered, explicit_slug=None)

    error_text = str(caught.value)
    assert "vision_enabled: true" in error_text
|
||||
|
||||
|
||||
def test_auto_pick_ignores_non_openrouter_entries():
    """A lone entry with a non-OpenRouter provider is not eligible for auto-pick."""
    # The model name is the top recommended one; only the provider rules it out.
    anthropic_direct = _entry(
        id=99,
        model_name="anthropic/claude-sonnet-4.5",
        provider="ANTHROPIC",
    )

    with pytest.raises(VisionConfigError):
        resolve_vision_llm([anthropic_direct], explicit_slug=None)
|
||||
|
||||
|
||||
def test_recommended_priority_is_a_stable_public_list():
    """If you reorder this, update the README's auto-pick claim too."""
    top_choice = RECOMMENDED_VISION_PRIORITY[0]

    # Pin the head of the list and the presence of one later entry.
    assert top_choice == "anthropic/claude-sonnet-4.5"
    assert "google/gemini-2.5-pro" in RECOMMENDED_VISION_PRIORITY
|
||||
Loading…
Add table
Add a link
Reference in a new issue