test: add agent refactor guardrail suite

2026-07-22 23:31:12 +02:00 · 2026-06-04 11:44:23 +02:00 · 2026-06-04 11:44:23 +02:00 · fb70e23dd2
commit fb70e23dd2
parent cb44063081
4 changed files with 326 additions and 0 deletions
--- a/surfsense_backend/tests/integration/agents/multi_agent_chat/test_agent_turn.py
+++ b/surfsense_backend/tests/integration/agents/multi_agent_chat/test_agent_turn.py
@ -0,0 +1,142 @@
 """Guardrail D: the real multi-agent is still assemblable and runnable.
 Builds the production ``create_multi_agent_chat_deep_agent`` factory against a
 real (test) DB with a scripted LLM, then drives one turn. This is the only
 guard that proves the *assembled* agent — full tool registry, middleware stack,
 compiled graph — still executes end to end after files move. A/B/C prove the
 parts import, wire, and load; this proves they run together.
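 Scripted LLM + faked external tools; everything we own (graph, middleware,
 DB-backed connector service) runs for real. The last test additionally checks
 that conversation state round-trips through the checkpointer between turns.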
 """
 from __future__ import annotations
 import pytest
 from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
 from langgraph.checkpoint.memory import InMemorySaver
 from app.agents.multi_agent_chat import create_multi_agent_chat_deep_agent
 from app.services.connector_service import ConnectorService
 from tests.integration.harness import (
    ScriptedTurn,
    StubToolSpec,
    build_scripted_harness,
 )
 # Module-level mark: pytest applies ``pytestmark`` to every test in this file,
 # so the whole module is selected or deselected via the ``integration`` marker.
 pytestmark = pytest.mark.integration
 def _last_ai_text(messages: list) -> str | None:
    """Return the text of the most recent ``AIMessage`` in *messages*.

    The list is scanned from the end. String content is returned unchanged;
    any other content is coerced with ``str()``. Returns ``None`` when the
    list holds no ``AIMessage`` at all.
    """
    latest = next(
        (msg for msg in reversed(messages) if isinstance(msg, AIMessage)),
        None,
    )
    if latest is None:
        return None
    body = latest.content
    if isinstance(body, str):
        return body
    return str(body)
@pytest.mark.asyncio
 async def test_agent_runs_a_scripted_text_turn(db_session, db_user, db_search_space):
    """Assemble the production agent and drive one scripted, text-only turn.

    The scripted model is given a single final answer (``"done"``); the test
    passes when that answer is the last AI message in the returned state.
    """
    scripted = build_scripted_harness(turns=[ScriptedTurn(text="done")])
    compiled_agent = await create_multi_agent_chat_deep_agent(
        llm=scripted.model,
        search_space_id=db_search_space.id,
        db_session=db_session,
        connector_service=ConnectorService(db_session),
        checkpointer=InMemorySaver(),
        user_id=str(db_user.id),
        thread_id=db_search_space.id,
        agent_config=None,
    )
    user_input = {"messages": [HumanMessage(content="hello")]}
    run_config = {"configurable": {"thread_id": "guard-d-thread-1"}}
    final_state = await compiled_agent.ainvoke(user_input, config=run_config)
    final_text = _last_ai_text(final_state["messages"])
    assert final_text == "done"
@pytest.mark.asyncio
 async def test_agent_routes_a_scripted_tool_call(db_session, db_user, db_search_space):
    """Drive a two-step script: a call to the stub ``echo`` tool, then final text.

    The test passes when the returned state holds a tool message whose content
    contains ``"echoed"`` and the last AI message is exactly ``"echoed"``.
    """
    script = [
        ScriptedTurn(
            tool_calls=[{"name": "echo", "args": {"x": 1}, "id": "call_1"}]
        ),
        ScriptedTurn(text="echoed"),
    ]
    echo_tool = StubToolSpec(
        name="echo",
        description="Echo the args back.",
        handler=lambda **kwargs: {"echoed": kwargs},
    )
    scripted = build_scripted_harness(turns=script, tools=[echo_tool])
    compiled_agent = await create_multi_agent_chat_deep_agent(
        llm=scripted.model,
        search_space_id=db_search_space.id,
        db_session=db_session,
        connector_service=ConnectorService(db_session),
        checkpointer=InMemorySaver(),
        user_id=str(db_user.id),
        thread_id=db_search_space.id,
        agent_config=None,
        # The stub tool is registered on top of the factory's own tool set.
        additional_tools=scripted.tools,
    )
    user_input = {"messages": [HumanMessage(content="echo please")]}
    run_config = {"configurable": {"thread_id": "guard-d-thread-2"}}
    final_state = await compiled_agent.ainvoke(user_input, config=run_config)
    tool_outputs = [
        str(msg.content)
        for msg in final_state["messages"]
        if isinstance(msg, ToolMessage)
    ]
    assert any("echoed" in output for output in tool_outputs)
    assert _last_ai_text(final_state["messages"]) == "echoed"
@pytest.mark.asyncio
 async def test_agent_checkpoint_round_trips_across_turns(
    db_session, db_user, db_search_space
 ):
    """Turn 2 must see turn 1's history, i.e. the checkpoint saves and reloads.

    The saver is an ``InMemorySaver``, which serializes via the same
    ``JsonPlusSerializer`` as the production Postgres checkpointer — so a state
    class that stopped being serializable after a module move fails here too.
    """
    scripted = build_scripted_harness(
        turns=[ScriptedTurn(text="ok-one"), ScriptedTurn(text="ok-two")]
    )
    saver = InMemorySaver()
    # Both turns share this saver and thread id so the second can pick up the
    # state the first one left behind.
    run_config = {"configurable": {"thread_id": "guard-e-thread-1"}}
    compiled_agent = await create_multi_agent_chat_deep_agent(
        llm=scripted.model,
        search_space_id=db_search_space.id,
        db_session=db_session,
        connector_service=ConnectorService(db_session),
        checkpointer=saver,
        user_id=str(db_user.id),
        thread_id=db_search_space.id,
        agent_config=None,
    )
    turn_one = await compiled_agent.ainvoke(
        {"messages": [HumanMessage(content="remember apple")]}, run_config
    )
    turn_two = await compiled_agent.ainvoke(
        {"messages": [HumanMessage(content="second turn")]}, run_config
    )
    human_texts = [
        msg.content for msg in turn_two["messages"] if isinstance(msg, HumanMessage)
    ]
    assert "remember apple" in human_texts, "turn 1 history not reloaded from checkpoint"
    assert len(turn_two["messages"]) > len(turn_one["messages"])
--- a/surfsense_backend/tests/unit/agents/multi_agent_chat/test_prompt_resources.py
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/test_prompt_resources.py
@ -0,0 +1,59 @@
 """Guardrail C: package-relative prompt/snippet resources must resolve.
 Prompt fragments are loaded by *package name* via ``importlib.resources``, not
 by import, so neither the import-all smoke test (guardrail A) nor mocked unit
 tests can see them. A move that relocates a package without its ``.md`` files,
 or that leaves a hardcoded package string stale, makes the loader return an
 empty string and silently degrades the system prompt. These tests assert the
 resources still resolve to non-empty content.
 (Builtin skill resources are covered separately by ``test_skills_backends.py``.)
 """
 from __future__ import annotations
 import pytest
 from app.agents.multi_agent_chat.main_agent.system_prompt.builder.load_md import (
    read_prompt_md,
 )
 from app.agents.multi_agent_chat.subagents.registry import (
    SUBAGENT_BUILDERS_BY_NAME,
    _route_resource_package,
 )
 from app.agents.multi_agent_chat.subagents.shared.md_file_reader import (
    read_md_file,
    read_shared_snippet,
 )
 # Module-level mark: pytest applies ``pytestmark`` to every test in this file,
 # so the whole module is selected or deselected via the ``unit`` marker.
 pytestmark = pytest.mark.unit
@pytest.mark.parametrize("name", sorted(SUBAGENT_BUILDERS_BY_NAME))
 def test_every_subagent_has_description_md(name: str):
    """Check that the specialist *name* has a non-blank ``description.md``.

    The resource package is derived from the registered builder; the
    ``description`` markdown read from that package must contain more than
    whitespace.
    """
    builder = SUBAGENT_BUILDERS_BY_NAME[name]
    package = _route_resource_package(builder)
    description = read_md_file(package, "description")
    assert description.strip(), (
        f"{name}: description.md missing/empty at package {package}"
    )
 # Real fragments under the hardcoded main-agent prompts package, including a
 # nested path — guards both the package string and nested resource resolution.
@pytest.mark.parametrize(
    "filename",
    [
        "core_behavior.md",
        "routing.md",
        "tools/web_search/description.md",
    ],
 )
 def test_main_agent_prompt_fragments_resolve(filename: str):
    """Check that the main-agent prompt fragment *filename* is not blank."""
    fragment = read_prompt_md(filename)
    assert fragment.strip(), f"prompt fragment {filename} is empty"
@pytest.mark.parametrize("snippet", ["output_contract_base", "verifiable_handle"])
 def test_shared_snippets_resolve(snippet: str):
    """Check that the shared subagent snippet *snippet* is not blank."""
    content = read_shared_snippet(snippet)
    assert content.strip(), f"snippet {snippet} is empty"
--- a/surfsense_backend/tests/unit/agents/multi_agent_chat/test_subagent_composition.py
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/test_subagent_composition.py
@ -0,0 +1,72 @@
 """Guardrail B: the subagent registry composition must stay intact.
 A structural move can silently drop, rename, or mis-wire a subagent builder
 (e.g. a forgotten import line). The compiled agent would then quietly lose a
 specialist with no ImportError. This test pins the exact registry contents and
 their cross-references so any such drift fails loudly.
 """
 from __future__ import annotations
 import pytest
 from app.agents.multi_agent_chat.constants import (
    SUBAGENT_TO_REQUIRED_CONNECTOR_MAP,
 )
 from app.agents.multi_agent_chat.subagents.registry import (
    SUBAGENT_BUILDERS_BY_NAME,
 )
 # Module-level mark: pytest applies ``pytestmark`` to every test in this file,
 # so the whole module is selected or deselected via the ``unit`` marker.
 pytestmark = pytest.mark.unit
 # The full specialist roster the main agent composes from: 4 builtins + 15
 # connector routes. Adding/removing a specialist is a deliberate product change
 # and must be reflected here.
 # Entries are listed alphabetically. The set is compared for exact equality
 # against the registry's keys, and (minus ``_CONNECTORLESS``) against the keys
 # of the required-connector map.
 _EXPECTED_SUBAGENTS = frozenset(
    {
        "airtable",
        "calendar",
        "clickup",
        "confluence",
        "deliverables",
        "discord",
        "dropbox",
        "gmail",
        "google_drive",
        "jira",
        "knowledge_base",
        "linear",
        "luma",
        "memory",
        "notion",
        "onedrive",
        "research",
        "slack",
        "teams",
    }
 )
 # Specialists that are always available regardless of connected sources, so they
 # carry no required-connector entry.
 # NOTE(review): the roster comment above counts 4 builtins but only 2 names are
 # listed here, so the remaining builtins are presumably expected to have an
 # entry in the required-connector map — confirm against ``constants``.
 _CONNECTORLESS = frozenset({"memory", "research"})
 def test_registry_contains_exactly_expected_subagents():
    """The registry's names must equal the pinned roster: none added, dropped, or renamed."""
    registered = set(SUBAGENT_BUILDERS_BY_NAME)
    assert registered == _EXPECTED_SUBAGENTS
 def test_every_builder_is_callable_route_agent():
    """Every registered builder is callable and defined in a module named ``*.agent``."""
    for subagent, factory in SUBAGENT_BUILDERS_BY_NAME.items():
        assert callable(factory), f"{subagent} builder is not callable"
        # ``__module__`` is where the builder was defined, not where it was
        # re-exported from, so this catches a builder wired from the wrong file.
        home_module = factory.__module__
        assert home_module.endswith(".agent"), (
            f"{subagent} builder lives in {home_module}, expected a *.agent module"
        )
 def test_required_connector_map_covers_connector_subagents():
    """The connector-gating map's keys equal the roster minus the connector-less names."""
    connector_gated = _EXPECTED_SUBAGENTS - _CONNECTORLESS
    mapped = set(SUBAGENT_TO_REQUIRED_CONNECTOR_MAP)
    assert mapped == connector_gated
--- a/surfsense_backend/tests/unit/agents/test_import_all.py
+++ b/surfsense_backend/tests/unit/agents/test_import_all.py
@ -0,0 +1,53 @@
 """Guardrail A: every agent module (and its prod entrypoints) must import.
 Static reachability analysis and mocked unit tests cannot catch a module that
 fails to import after files move or imports are rewritten. This smoke test
 imports every submodule under ``app.agents`` plus the production entrypoints
 that consume agents, turning a move-time ``ImportError`` into a fast, local CI
 signal instead of a runtime failure in prod.
 """
 from __future__ import annotations
 import importlib
 import pkgutil
 import pytest
 import app.agents as agents_pkg
 # Module-level mark: pytest applies ``pytestmark`` to every test in this file,
 # so the whole module is selected or deselected via the ``unit`` marker.
 pytestmark = pytest.mark.unit
 # Prod consumers of app.agents that live OUTSIDE the agents tree; a broken
 # importer here would not be caught by walking app.agents alone.
 # Each entry is a dotted module path that ``test_prod_entrypoint_imports``
 # passes to ``importlib.import_module``.
 _PROD_ENTRYPOINTS = [
    "app.tasks.chat.streaming.flows.new_chat.orchestrator",
    "app.tasks.chat.streaming.agent.builder",
    "app.gateway.agent_invoke",
    "app.routes.new_chat_routes",
 ]
 def _iter_agent_modules() -> list[str]:
    """Return the sorted, de-duplicated dotted names of everything under ``agents_pkg``.

    Names that ``pkgutil.walk_packages`` reports through ``onerror`` are kept
    in the result alongside the names it yields, so they still become test
    parameters instead of dropping out of the list.
    """
    failed: set[str] = set()
    walker = pkgutil.walk_packages(
        agents_pkg.__path__,
        prefix=f"{agents_pkg.__name__}.",
        onerror=failed.add,
    )
    discovered = {info.name for info in walker}
    return sorted(discovered | failed)
@pytest.mark.parametrize("module_name", _iter_agent_modules())
 def test_agent_module_imports(module_name: str) -> None:
    """Importing the module must not raise (no broken or missed imports).

    There is no explicit assertion: any exception raised while importing
    ``module_name`` propagates out of the test and fails it.
    """
    importlib.import_module(module_name)
@pytest.mark.parametrize("module_name", _PROD_ENTRYPOINTS)
 def test_prod_entrypoint_imports(module_name: str) -> None:
    """The production code paths that build/invoke agents must import.

    There is no explicit assertion: any exception raised while importing
    ``module_name`` propagates out of the test and fails it.
    """
    importlib.import_module(module_name)