mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
The MCP `instructions` hint is static and baked into the client prompt, while tool names, signatures, and error codes are discovered dynamically via tools/list. The two had drifted: instructions restated stale signatures and an error-code enum that omitted schema_validation and trigger_path_conflict. - Trim instructions.py to tool names + call order; stop restating signatures and error codes the dynamic surface already carries. - Document each tool's full error_code contract in the save_workflow and create_workflow docstrings (the descriptions shipped via tools/list). - Add test_mcp_instructions_drift.py: every tool named in the guide must be registered, and every error_code a tool returns must appear in its description. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
115 lines
4.3 KiB
Python
115 lines
4.3 KiB
Python
"""Drift guards between the static MCP guide and the live tool surface.
|
|
|
|
`api/mcp_server/instructions.py` is free text baked into the client
|
|
system prompt. It is *not* the authoritative description of the tools —
|
|
names, signatures, and per-tool error codes reach the model dynamically
|
|
via `tools/list`, derived from each tool's own function signature and
|
|
docstring. These tests fail on the two classic drift modes:
|
|
|
|
1. The guide references a tool that is no longer registered (renamed or
|
|
removed) — the model would be told to call something that 404s.
|
|
2. A tool returns an `error_code` that is absent from the description it
|
|
ships via `tools/list` — the model can't learn to recover from it.
|
|
|
|
Keep the guide about orchestration (call order, hard constraints) and let
|
|
the tools describe themselves.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from api.mcp_server import instructions as instructions_module
|
|
from api.mcp_server.server import mcp
|
|
from api.mcp_server.tools import create_workflow as create_workflow_module
|
|
from api.mcp_server.tools import save_workflow as save_workflow_module
|
|
|
|
# Every registered MCP tool name starts with one of these verbs. A
|
|
# backticked snake_case token in the guide whose leading word is a verb is
|
|
# treated as a tool reference; field/reference names like `tool_refs`,
|
|
# `credential_ref`, or `pre_call_fetch` don't start with a verb and are
|
|
# correctly ignored. Extend this only when a new tool introduces a new
|
|
# leading verb (a missing verb under-checks, it never false-fails).
|
|
_TOOL_VERB_PREFIXES = frozenset(
|
|
{
|
|
"search",
|
|
"read",
|
|
"list",
|
|
"get",
|
|
"save",
|
|
"create",
|
|
"update",
|
|
"delete",
|
|
"add",
|
|
"remove",
|
|
"set",
|
|
}
|
|
)
|
|
|
|
# A backtick immediately followed by a snake_case identifier (>= 1
|
|
# underscore). Anchoring on the opening backtick captures the leading
|
|
# identifier of a code span whether it is bare (`read_doc`) or a call
|
|
# (`read_doc(path)`), while skipping DSL constructs like `wf.edge` or
|
|
# `new Workflow` whose first char after the backtick isn't `[a-z_]`.
|
|
_BACKTICKED_SNAKE_RE = re.compile(r"`([a-z][a-z0-9]*(?:_[a-z0-9]+)+)")
|
|
|
|
# Error codes are emitted as the first string arg to `_error_result(...)`.
|
|
_ERROR_RESULT_LITERAL_RE = re.compile(r'_error_result\(\s*"([a-z_]+)"')
|
|
# `parse_error` / `validation_error` are picked by a `code_key` ternary
|
|
# rather than passed as a literal to `_error_result`, so match them too.
|
|
_CODE_KEY_LITERAL_RE = re.compile(r'"(parse_error|validation_error)"')
|
|
|
|
|
|
def _referenced_tool_names(text: str) -> set[str]:
|
|
return {
|
|
token
|
|
for token in _BACKTICKED_SNAKE_RE.findall(text)
|
|
if token.split("_", 1)[0] in _TOOL_VERB_PREFIXES
|
|
}
|
|
|
|
|
|
def _returned_error_codes(module) -> set[str]:
|
|
source = Path(module.__file__).read_text(encoding="utf-8")
|
|
return set(_ERROR_RESULT_LITERAL_RE.findall(source)) | set(
|
|
_CODE_KEY_LITERAL_RE.findall(source)
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_guide_only_references_registered_tools():
|
|
registered = {tool.name for tool in await mcp.list_tools()}
|
|
referenced = _referenced_tool_names(instructions_module.DOGRAH_MCP_INSTRUCTIONS)
|
|
|
|
assert referenced, "no tool references extracted — the regex likely broke"
|
|
unknown = sorted(referenced - registered)
|
|
assert not unknown, (
|
|
f"instructions.py references tools that are not registered: {unknown}. "
|
|
f"Rename/remove the reference or register the tool. "
|
|
f"Registered tools: {sorted(registered)}."
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
@pytest.mark.parametrize(
|
|
"tool_name, module",
|
|
[
|
|
("save_workflow", save_workflow_module),
|
|
("create_workflow", create_workflow_module),
|
|
],
|
|
)
|
|
async def test_tool_documents_every_error_code_it_returns(tool_name, module):
|
|
descriptions = {
|
|
tool.name: tool.description or "" for tool in await mcp.list_tools()
|
|
}
|
|
description = descriptions[tool_name]
|
|
returned = _returned_error_codes(module)
|
|
|
|
assert returned, f"no error codes detected in {tool_name} source — regex broke"
|
|
undocumented = sorted(code for code in returned if code not in description)
|
|
assert not undocumented, (
|
|
f"{tool_name} returns error_code(s) {undocumented} absent from the description "
|
|
f"shipped via tools/list. Document them in the {tool_name} docstring."
|
|
)
|