mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-06 20:15:17 +02:00
test: add agent refactor guardrail suite
This commit is contained in:
parent
cb44063081
commit
fb70e23dd2
4 changed files with 326 additions and 0 deletions
|
|
@ -0,0 +1,142 @@
|
||||||
|
"""Guardrail D: the real multi-agent is still assemblable and runnable.
|
||||||
|
|
||||||
|
Builds the production ``create_multi_agent_chat_deep_agent`` factory against a
|
||||||
|
real (test) DB with a scripted LLM, then drives one turn. This is the only
|
||||||
|
guard that proves the *assembled* agent — full tool registry, middleware stack,
|
||||||
|
compiled graph — still executes end to end after files move. A/B/C prove the
|
||||||
|
parts import, wire, and load; this proves they run together.
|
||||||
|
|
||||||
|
Scripted LLM + faked external tools; everything we own (graph, middleware,
|
||||||
|
DB-backed connector service) runs for real.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
|
||||||
|
from langgraph.checkpoint.memory import InMemorySaver
|
||||||
|
|
||||||
|
from app.agents.multi_agent_chat import create_multi_agent_chat_deep_agent
|
||||||
|
from app.services.connector_service import ConnectorService
|
||||||
|
from tests.integration.harness import (
|
||||||
|
ScriptedTurn,
|
||||||
|
StubToolSpec,
|
||||||
|
build_scripted_harness,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Module-level mark: pytest applies ``integration`` to every test in this file.
pytestmark = pytest.mark.integration
|
||||||
|
|
||||||
|
|
||||||
|
def _last_ai_text(messages: list) -> str | None:
    """Return the content of the most recent ``AIMessage`` as text.

    The list is scanned from the end. String content is returned unchanged;
    any other content is passed through ``str()``. ``None`` is returned when
    the list holds no ``AIMessage`` at all.
    """
    latest = next(
        (msg for msg in reversed(messages) if isinstance(msg, AIMessage)),
        None,
    )
    if latest is None:
        return None
    body = latest.content
    if isinstance(body, str):
        return body
    return str(body)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_agent_runs_a_scripted_text_turn(db_session, db_user, db_search_space):
    """One scripted text reply flows through the fully assembled agent.

    The harness model is scripted with a single ``"done"`` turn; the agent is
    built by the production factory and invoked once with a human message.
    The last AI message in the result must carry exactly that text.
    """
    scripted = build_scripted_harness(turns=[ScriptedTurn(text="done")])

    compiled = await create_multi_agent_chat_deep_agent(
        llm=scripted.model,
        search_space_id=db_search_space.id,
        db_session=db_session,
        connector_service=ConnectorService(db_session),
        checkpointer=InMemorySaver(),
        user_id=str(db_user.id),
        thread_id=db_search_space.id,
        agent_config=None,
    )

    user_input = {"messages": [HumanMessage(content="hello")]}
    run_config = {"configurable": {"thread_id": "guard-d-thread-1"}}
    outcome = await compiled.ainvoke(user_input, config=run_config)

    final_text = _last_ai_text(outcome["messages"])
    assert final_text == "done"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_agent_routes_a_scripted_tool_call(db_session, db_user, db_search_space):
    """A scripted tool call reaches its stub tool and the run then finishes.

    Turn one of the script asks for the ``echo`` tool; turn two is the final
    ``"echoed"`` text. The stub tool is handed to the factory through
    ``additional_tools``. The result must contain a ``ToolMessage`` whose
    content mentions ``"echoed"`` and must end on the scripted final text.
    """
    script = [
        ScriptedTurn(tool_calls=[{"name": "echo", "args": {"x": 1}, "id": "call_1"}]),
        ScriptedTurn(text="echoed"),
    ]
    echo_tool = StubToolSpec(
        name="echo",
        description="Echo the args back.",
        handler=lambda **kwargs: {"echoed": kwargs},
    )
    scripted = build_scripted_harness(turns=script, tools=[echo_tool])

    compiled = await create_multi_agent_chat_deep_agent(
        llm=scripted.model,
        search_space_id=db_search_space.id,
        db_session=db_session,
        connector_service=ConnectorService(db_session),
        checkpointer=InMemorySaver(),
        user_id=str(db_user.id),
        thread_id=db_search_space.id,
        agent_config=None,
        additional_tools=scripted.tools,
    )

    user_input = {"messages": [HumanMessage(content="echo please")]}
    run_config = {"configurable": {"thread_id": "guard-d-thread-2"}}
    outcome = await compiled.ainvoke(user_input, config=run_config)

    history = outcome["messages"]
    tool_outputs = [msg for msg in history if isinstance(msg, ToolMessage)]
    assert any("echoed" in str(msg.content) for msg in tool_outputs)
    assert _last_ai_text(history) == "echoed"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_agent_checkpoint_round_trips_across_turns(
    db_session, db_user, db_search_space
):
    """Turn 2 sees turn 1's history, proving the checkpoint serializes and reloads.

    Uses InMemorySaver, which serializes via the same ``JsonPlusSerializer`` as
    the production Postgres checkpointer — so a state class that became
    unserializable after a module move would fail here too.

    A fresh agent is built for each turn against the *same* checkpointer and
    thread id, so the history visible in turn 2 can only have come from the
    saved checkpoint, not from the compiled graph object used in turn 1.
    """
    harness = build_scripted_harness(
        turns=[ScriptedTurn(text="ok-one"), ScriptedTurn(text="ok-two")]
    )
    checkpointer = InMemorySaver()
    config = {"configurable": {"thread_id": "guard-e-thread-1"}}

    async def _build():
        # Every build shares ``checkpointer`` and the scripted model; only the
        # assembled agent object itself is new.
        return await create_multi_agent_chat_deep_agent(
            llm=harness.model,
            search_space_id=db_search_space.id,
            db_session=db_session,
            connector_service=ConnectorService(db_session),
            checkpointer=checkpointer,
            user_id=str(db_user.id),
            thread_id=db_search_space.id,
            agent_config=None,
        )

    first_agent = await _build()
    first = await first_agent.ainvoke(
        {"messages": [HumanMessage(content="remember apple")]}, config
    )

    # Rebuild before turn 2 so the earlier history must be reloaded from the
    # checkpointer rather than carried by the first agent instance.
    second_agent = await _build()
    second = await second_agent.ainvoke(
        {"messages": [HumanMessage(content="second turn")]}, config
    )

    texts = [m.content for m in second["messages"] if isinstance(m, HumanMessage)]
    assert "remember apple" in texts, "turn 1 history not reloaded from checkpoint"
    assert len(second["messages"]) > len(first["messages"])
|
||||||
|
|
@ -0,0 +1,59 @@
|
||||||
|
"""Guardrail C: package-relative prompt/snippet resources must resolve.
|
||||||
|
|
||||||
|
Prompt fragments are loaded by *package name* via ``importlib.resources`` — not
|
||||||
|
by import, so the import-all smoke test (guardrail A) cannot see them, and not
|
||||||
|
by mocked unit tests. A move that relocates a package without its ``.md`` files,
|
||||||
|
or that leaves a hardcoded package string stale, returns an empty string and
|
||||||
|
silently degrades the system prompt. These tests assert the resources still
|
||||||
|
resolve to non-empty content.
|
||||||
|
|
||||||
|
(Builtin skill resources are covered separately by ``test_skills_backends.py``.)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.agents.multi_agent_chat.main_agent.system_prompt.builder.load_md import (
|
||||||
|
read_prompt_md,
|
||||||
|
)
|
||||||
|
from app.agents.multi_agent_chat.subagents.registry import (
|
||||||
|
SUBAGENT_BUILDERS_BY_NAME,
|
||||||
|
_route_resource_package,
|
||||||
|
)
|
||||||
|
from app.agents.multi_agent_chat.subagents.shared.md_file_reader import (
|
||||||
|
read_md_file,
|
||||||
|
read_shared_snippet,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Module-level mark: pytest applies ``unit`` to every test in this file.
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("name", sorted(SUBAGENT_BUILDERS_BY_NAME))
def test_every_subagent_has_description_md(name: str):
    """Every registered specialist resolves a non-blank ``description`` resource.

    The resource package is derived from the specialist's registered builder,
    then ``description`` is read from that package and must contain more than
    whitespace.
    """
    builder = SUBAGENT_BUILDERS_BY_NAME[name]
    resource_package = _route_resource_package(builder)
    description = read_md_file(resource_package, "description")
    failure = f"{name}: description.md missing/empty at package {resource_package}"
    assert description.strip(), failure
|
||||||
|
|
||||||
|
|
||||||
|
# Fragments that exist under the hardcoded main-agent prompts package. One of
# them sits on a nested path, so both the package string and nested resource
# resolution are exercised.
@pytest.mark.parametrize(
    "filename",
    ["core_behavior.md", "routing.md", "tools/web_search/description.md"],
)
def test_main_agent_prompt_fragments_resolve(filename: str):
    """Each listed main-agent prompt fragment loads with non-blank content."""
    fragment = read_prompt_md(filename)
    assert fragment.strip(), f"prompt fragment {filename} is empty"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("snippet", ["output_contract_base", "verifiable_handle"])
def test_shared_snippets_resolve(snippet: str):
    """Each shared subagent snippet loads with non-blank content."""
    snippet_text = read_shared_snippet(snippet)
    assert snippet_text.strip(), f"snippet {snippet} is empty"
|
||||||
|
|
@ -0,0 +1,72 @@
|
||||||
|
"""Guardrail B: the subagent registry composition must stay intact.
|
||||||
|
|
||||||
|
A structural move can silently drop, rename, or mis-wire a subagent builder
|
||||||
|
(e.g. a forgotten import line). The compiled agent would then quietly lose a
|
||||||
|
specialist with no ImportError. This test pins the exact registry contents and
|
||||||
|
their cross-references so any such drift fails loudly.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.agents.multi_agent_chat.constants import (
|
||||||
|
SUBAGENT_TO_REQUIRED_CONNECTOR_MAP,
|
||||||
|
)
|
||||||
|
from app.agents.multi_agent_chat.subagents.registry import (
|
||||||
|
SUBAGENT_BUILDERS_BY_NAME,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Module-level mark: pytest applies ``unit`` to every test in this file.
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
# The full specialist roster the main agent composes from: 4 builtins + 15
# connector routes. Adding/removing a specialist is a deliberate product change
# and must be reflected here.
#
# NOTE(review): the builtin/connector split below is inferred from the
# "4 + 15" count and the specialist names — confirm against the registry.
_BUILTIN_SUBAGENTS = frozenset(
    {
        "deliverables",
        "knowledge_base",
        "memory",
        "research",
    }
)

_CONNECTOR_SUBAGENTS = frozenset(
    {
        "airtable",
        "calendar",
        "clickup",
        "confluence",
        "discord",
        "dropbox",
        "gmail",
        "google_drive",
        "jira",
        "linear",
        "luma",
        "notion",
        "onedrive",
        "slack",
        "teams",
    }
)

# Pinned registry contents: the union of the two groups above.
_EXPECTED_SUBAGENTS = _BUILTIN_SUBAGENTS | _CONNECTOR_SUBAGENTS

# Specialists that are always available regardless of connected sources, so they
# carry no required-connector entry.
_CONNECTORLESS = frozenset({"memory", "research"})
|
||||||
|
|
||||||
|
|
||||||
|
def test_registry_contains_exactly_expected_subagents():
    """The registry's keys are exactly the pinned specialist roster."""
    registered_names = set(SUBAGENT_BUILDERS_BY_NAME.keys())
    assert registered_names == _EXPECTED_SUBAGENTS
|
||||||
|
|
||||||
|
|
||||||
|
def test_every_builder_is_callable_route_agent():
    """Each registry value is a callable defined in its route's ``agent`` module.

    Every violation is collected before asserting, so a move that mis-wires
    several builders reports all of them in one run instead of stopping at
    the first offender.
    """
    problems: list[str] = []
    for name, builder in SUBAGENT_BUILDERS_BY_NAME.items():
        if not callable(builder):
            problems.append(f"{name} builder is not callable")
            continue
        # Guard against objects without a string ``__module__`` so they are
        # reported as a violation rather than raising ``AttributeError``.
        module = getattr(builder, "__module__", None)
        if not isinstance(module, str) or not module.endswith(".agent"):
            problems.append(
                f"{name} builder lives in {module}, expected a *.agent module"
            )
    assert not problems, "\n".join(problems)
|
||||||
|
|
||||||
|
|
||||||
|
def test_required_connector_map_covers_connector_subagents():
    """The connector-gating map has one key per connector-gated specialist.

    Its keys must equal the pinned roster minus the connectorless specialists.
    """
    gated_specialists = _EXPECTED_SUBAGENTS - _CONNECTORLESS
    mapped_specialists = set(SUBAGENT_TO_REQUIRED_CONNECTOR_MAP)
    assert mapped_specialists == gated_specialists
|
||||||
53
surfsense_backend/tests/unit/agents/test_import_all.py
Normal file
53
surfsense_backend/tests/unit/agents/test_import_all.py
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
"""Guardrail A: every agent module (and its prod entrypoints) must import.
|
||||||
|
|
||||||
|
Static reachability analysis and mocked unit tests cannot catch a module that
|
||||||
|
fails to import after files move or imports are rewritten. This smoke test
|
||||||
|
imports every submodule under ``app.agents`` plus the production entrypoints
|
||||||
|
that consume agents, turning a move-time ``ImportError`` into a fast, local CI
|
||||||
|
signal instead of a runtime failure in prod.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import importlib
|
||||||
|
import pkgutil
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import app.agents as agents_pkg
|
||||||
|
|
||||||
|
# Module-level mark: pytest applies ``unit`` to every test in this file.
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
# Prod consumers of app.agents that live OUTSIDE the agents tree; a broken
# importer here would not be caught by walking app.agents alone.
# Kept as a tuple so the parametrized module list cannot be mutated at runtime.
_PROD_ENTRYPOINTS = (
    "app.tasks.chat.streaming.flows.new_chat.orchestrator",
    "app.tasks.chat.streaming.agent.builder",
    "app.gateway.agent_invoke",
    "app.routes.new_chat_routes",
)
|
||||||
|
|
||||||
|
|
||||||
|
def _iter_agent_modules() -> list[str]:
    """Return the sorted, de-duplicated module names found under ``app.agents``.

    ``pkgutil.walk_packages`` is run over the agents package path. Names passed
    to its ``onerror`` callback are recorded alongside the names it yields, so
    those modules still become test parameters and are imported by the test
    itself.
    """
    discovered: set[str] = set()
    walker = pkgutil.walk_packages(
        agents_pkg.__path__,
        prefix=f"{agents_pkg.__name__}.",
        onerror=discovered.add,
    )
    for module_info in walker:
        discovered.add(module_info.name)
    return sorted(discovered)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("module_name", _iter_agent_modules())
def test_agent_module_imports(module_name: str) -> None:
    """Each discovered ``app.agents`` module imports without raising."""
    # The import is the whole check: any exception it raises fails the test.
    importlib.import_module(name=module_name)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("module_name", _PROD_ENTRYPOINTS)
def test_prod_entrypoint_imports(module_name: str) -> None:
    """Each listed production entrypoint module imports without raising."""
    # The import is the whole check: any exception it raises fails the test.
    importlib.import_module(name=module_name)
|
||||||
Loading…
Add table
Add a link
Reference in a new issue