diff --git a/surfsense_backend/tests/integration/agents/multi_agent_chat/test_agent_turn.py b/surfsense_backend/tests/integration/agents/multi_agent_chat/test_agent_turn.py
new file mode 100644
index 000000000..f8677c2e8
--- /dev/null
+++ b/surfsense_backend/tests/integration/agents/multi_agent_chat/test_agent_turn.py
@@ -0,0 +1,155 @@
+"""Guardrail D: the real multi-agent is still assemblable and runnable.
+
+Builds the production ``create_multi_agent_chat_deep_agent`` factory against a
+real (test) DB with a scripted LLM, then drives one turn. This is the only
+guard that proves the *assembled* agent — full tool registry, middleware stack,
+compiled graph — still executes end to end after files move. A/B/C prove the
+parts import, wire, and load; this proves they run together.
+
+Scripted LLM + faked external tools; everything we own (graph, middleware,
+DB-backed connector service) runs for real.
+"""
+
+from __future__ import annotations
+
+import pytest
+from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
+from langgraph.checkpoint.memory import InMemorySaver
+
+from app.agents.multi_agent_chat import create_multi_agent_chat_deep_agent
+from app.services.connector_service import ConnectorService
+from tests.integration.harness import (
+    ScriptedTurn,
+    StubToolSpec,
+    build_scripted_harness,
+)
+
+pytestmark = pytest.mark.integration
+
+
+def _last_ai_text(messages: list) -> str | None:
+    """Return the content of the last ``AIMessage`` as text, or ``None`` if none.
+
+    Non-string content is coerced with ``str`` so the result is always
+    comparable to a plain string.
+    """
+    for message in reversed(messages):
+        if isinstance(message, AIMessage):
+            content = message.content
+            return content if isinstance(content, str) else str(content)
+    return None
+
+
+@pytest.mark.asyncio
+async def test_agent_runs_a_scripted_text_turn(db_session, db_user, db_search_space):
+    """A freshly assembled agent streams a scripted final-text turn to completion."""
+    harness = build_scripted_harness(turns=[ScriptedTurn(text="done")])
+
+    agent = await create_multi_agent_chat_deep_agent(
+        llm=harness.model,
+        search_space_id=db_search_space.id,
+        db_session=db_session,
+        connector_service=ConnectorService(db_session),
+        checkpointer=InMemorySaver(),
+        user_id=str(db_user.id),
+        thread_id=db_search_space.id,
+        agent_config=None,
+    )
+
+    result = await agent.ainvoke(
+        {"messages": [HumanMessage(content="hello")]},
+        config={"configurable": {"thread_id": "guard-d-thread-1"}},
+    )
+
+    assert _last_ai_text(result["messages"]) == "done"
+
+
+@pytest.mark.asyncio
+async def test_agent_routes_a_scripted_tool_call(db_session, db_user, db_search_space):
+    """The compiled graph routes a model tool call to its tool and resumes."""
+    harness = build_scripted_harness(
+        turns=[
+            ScriptedTurn(
+                tool_calls=[{"name": "echo", "args": {"x": 1}, "id": "call_1"}]
+            ),
+            ScriptedTurn(text="echoed"),
+        ],
+        tools=[
+            StubToolSpec(
+                name="echo",
+                description="Echo the args back.",
+                handler=lambda **kwargs: {"echoed": kwargs},
+            ),
+        ],
+    )
+
+    agent = await create_multi_agent_chat_deep_agent(
+        llm=harness.model,
+        search_space_id=db_search_space.id,
+        db_session=db_session,
+        connector_service=ConnectorService(db_session),
+        checkpointer=InMemorySaver(),
+        user_id=str(db_user.id),
+        thread_id=db_search_space.id,
+        agent_config=None,
+        additional_tools=harness.tools,
+    )
+
+    result = await agent.ainvoke(
+        {"messages": [HumanMessage(content="echo please")]},
+        config={"configurable": {"thread_id": "guard-d-thread-2"}},
+    )
+
+    tool_messages = [m for m in result["messages"] if isinstance(m, ToolMessage)]
+    assert any("echoed" in str(m.content) for m in tool_messages)
+    assert _last_ai_text(result["messages"]) == "echoed"
+
+
+@pytest.mark.asyncio
+async def test_agent_checkpoint_round_trips_across_turns(
+    db_session, db_user, db_search_space
+):
+    """Turn 2 sees turn 1's history, proving the checkpoint serializes and reloads.
+
+    Uses InMemorySaver, which serializes via the same ``JsonPlusSerializer`` as
+    the production Postgres checkpointer — so a state class that became
+    unserializable after a module move would fail here too.
+
+    Each turn runs on a separately built agent sharing one checkpointer, so
+    turn 2 also proves a newly assembled graph can load the saved state.
+    """
+    harness = build_scripted_harness(
+        turns=[ScriptedTurn(text="ok-one"), ScriptedTurn(text="ok-two")]
+    )
+    checkpointer = InMemorySaver()
+    config = {"configurable": {"thread_id": "guard-e-thread-1"}}
+
+    async def _build():
+        """Assemble a fresh agent bound to the shared checkpointer."""
+        return await create_multi_agent_chat_deep_agent(
+            llm=harness.model,
+            search_space_id=db_search_space.id,
+            db_session=db_session,
+            connector_service=ConnectorService(db_session),
+            checkpointer=checkpointer,
+            user_id=str(db_user.id),
+            thread_id=db_search_space.id,
+            agent_config=None,
+        )
+
+    first_agent = await _build()
+    first = await first_agent.ainvoke(
+        {"messages": [HumanMessage(content="remember apple")]}, config
+    )
+    # Rebuild rather than reuse the agent: the two instances share the
+    # checkpointer, so turn 2 exercises a reload by a newly assembled graph.
+    second_agent = await _build()
+    second = await second_agent.ainvoke(
+        {"messages": [HumanMessage(content="second turn")]}, config
+    )
+
+    texts = [
+        m.content for m in second["messages"] if isinstance(m, HumanMessage)
+    ]
+    assert "remember apple" in texts, "turn 1 history not reloaded from checkpoint"
+    assert len(second["messages"]) > len(first["messages"])
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/test_prompt_resources.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/test_prompt_resources.py
new file mode 100644
index 000000000..c724fd76f
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/test_prompt_resources.py
@@ -0,0 +1,59 @@
+"""Guardrail C: package-relative prompt/snippet resources must resolve.
+
+Prompt fragments are loaded by *package name* via ``importlib.resources`` — not
+by import, so neither the import-all smoke test (guardrail A) nor mocked unit
+tests can see them. A move that relocates a package without its ``.md`` files,
+or that leaves a hardcoded package string stale, returns an empty string and
+silently degrades the system prompt. These tests assert the resources still
+resolve to non-empty content.
+
+(Builtin skill resources are covered separately by ``test_skills_backends.py``.)
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.multi_agent_chat.main_agent.system_prompt.builder.load_md import (
+    read_prompt_md,
+)
+from app.agents.multi_agent_chat.subagents.registry import (
+    SUBAGENT_BUILDERS_BY_NAME,
+    _route_resource_package,
+)
+from app.agents.multi_agent_chat.subagents.shared.md_file_reader import (
+    read_md_file,
+    read_shared_snippet,
+)
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.mark.parametrize("name", sorted(SUBAGENT_BUILDERS_BY_NAME))
+def test_every_subagent_has_description_md(name: str):
+    """Each specialist ships a non-empty ``description.md`` next to its agent."""
+    package = _route_resource_package(SUBAGENT_BUILDERS_BY_NAME[name])
+    assert read_md_file(package, "description").strip(), (
+        f"{name}: description.md missing/empty at package {package}"
+    )
+
+
+# Real fragments under the hardcoded main-agent prompts package, including a
+# nested path — guards both the package string and nested resource resolution.
+@pytest.mark.parametrize(
+    "filename",
+    [
+        "core_behavior.md",
+        "routing.md",
+        "tools/web_search/description.md",
+    ],
+)
+def test_main_agent_prompt_fragments_resolve(filename: str):
+    """Main-agent prompt fragments resolve to non-empty content."""
+    assert read_prompt_md(filename).strip(), f"prompt fragment {filename} is empty"
+
+
+@pytest.mark.parametrize("snippet", ["output_contract_base", "verifiable_handle"])
+def test_shared_snippets_resolve(snippet: str):
+    """Shared subagent snippets resolve from the snippets package."""
+    assert read_shared_snippet(snippet).strip(), f"snippet {snippet} is empty"
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/test_subagent_composition.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/test_subagent_composition.py
new file mode 100644
index 000000000..00ed65b50
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/test_subagent_composition.py
@@ -0,0 +1,72 @@
+"""Guardrail B: the subagent registry composition must stay intact.
+
+A structural move can silently drop, rename, or mis-wire a subagent builder
+(e.g. a forgotten import line). The compiled agent would then quietly lose a
+specialist with no ImportError. This test pins the exact registry contents and
+their cross-references so any such drift fails loudly.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.multi_agent_chat.constants import (
+    SUBAGENT_TO_REQUIRED_CONNECTOR_MAP,
+)
+from app.agents.multi_agent_chat.subagents.registry import (
+    SUBAGENT_BUILDERS_BY_NAME,
+)
+
+pytestmark = pytest.mark.unit
+
+# The full specialist roster the main agent composes from: 4 builtins + 15
+# connector routes. Adding/removing a specialist is a deliberate product change
+# and must be reflected here.
+_EXPECTED_SUBAGENTS = frozenset(
+    {
+        "airtable",
+        "calendar",
+        "clickup",
+        "confluence",
+        "deliverables",
+        "discord",
+        "dropbox",
+        "gmail",
+        "google_drive",
+        "jira",
+        "knowledge_base",
+        "linear",
+        "luma",
+        "memory",
+        "notion",
+        "onedrive",
+        "research",
+        "slack",
+        "teams",
+    }
+)
+
+# Specialists that are always available regardless of connected sources, so they
+# carry no required-connector entry.
+_CONNECTORLESS = frozenset({"memory", "research"})
+
+
+def test_registry_contains_exactly_expected_subagents():
+    """No specialist is silently added, dropped, or renamed by a move."""
+    assert set(SUBAGENT_BUILDERS_BY_NAME) == _EXPECTED_SUBAGENTS
+
+
+def test_every_builder_is_callable_route_agent():
+    """Each registry value is a callable defined in its route's ``agent`` module."""
+    for name, builder in SUBAGENT_BUILDERS_BY_NAME.items():
+        assert callable(builder), f"{name} builder is not callable"
+        assert builder.__module__.endswith(".agent"), (
+            f"{name} builder lives in {builder.__module__}, expected a *.agent module"
+        )
+
+
+def test_required_connector_map_covers_connector_subagents():
+    """The connector-gating map stays in lockstep with the registry."""
+    assert set(SUBAGENT_TO_REQUIRED_CONNECTOR_MAP) == (
+        _EXPECTED_SUBAGENTS - _CONNECTORLESS
+    )
diff --git a/surfsense_backend/tests/unit/agents/test_import_all.py b/surfsense_backend/tests/unit/agents/test_import_all.py
new file mode 100644
index 000000000..b45bf3359
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/test_import_all.py
@@ -0,0 +1,56 @@
+"""Guardrail A: every agent module (and its prod entrypoints) must import.
+
+Static reachability analysis and mocked unit tests cannot catch a module that
+fails to import after files move or imports are rewritten. This smoke test
+imports every submodule under ``app.agents`` plus the production entrypoints
+that consume agents, turning a move-time ``ImportError`` into a fast, local CI
+signal instead of a runtime failure in prod.
+"""
+
+from __future__ import annotations
+
+import importlib
+import pkgutil
+
+import pytest
+
+import app.agents as agents_pkg
+
+pytestmark = pytest.mark.unit
+
+# Prod consumers of app.agents that live OUTSIDE the agents tree; a broken
+# importer here would not be caught by walking app.agents alone.
+_PROD_ENTRYPOINTS = [
+    "app.tasks.chat.streaming.flows.new_chat.orchestrator",
+    "app.tasks.chat.streaming.agent.builder",
+    "app.gateway.agent_invoke",
+    "app.routes.new_chat_routes",
+]
+
+
+def _iter_agent_modules() -> list[str]:
+    """Return sorted, unique names of every module found under ``app.agents``.
+
+    Packages that fail to import while being walked are included as well, so
+    the parametrized import test re-imports them and surfaces the error.
+    """
+    names: set[str] = set()
+    # walk_packages reports a package it could not import via ``onerror``
+    # instead of raising; recording the name keeps that failure visible.
+    for info in pkgutil.walk_packages(
+        agents_pkg.__path__, prefix=agents_pkg.__name__ + ".", onerror=names.add
+    ):
+        names.add(info.name)
+    return sorted(names)
+
+
+@pytest.mark.parametrize("module_name", _iter_agent_modules())
+def test_agent_module_imports(module_name: str) -> None:
+    """Importing the module must not raise (no broken or missed imports)."""
+    importlib.import_module(module_name)
+
+
+@pytest.mark.parametrize("module_name", _PROD_ENTRYPOINTS)
+def test_prod_entrypoint_imports(module_name: str) -> None:
+    """The production code paths that build/invoke agents must import."""
+    importlib.import_module(module_name)