diff --git a/api/mcp_server/server.py b/api/mcp_server/server.py
index 12ad42e..0b007f2 100644
--- a/api/mcp_server/server.py
+++ b/api/mcp_server/server.py
@@ -8,6 +8,7 @@ from api.mcp_server.tools.catalog import (
     list_tools,
 )
 from api.mcp_server.tools.create_workflow import create_workflow
+from api.mcp_server.tools.docs_search import search_docs
 from api.mcp_server.tools.get_workflow_code import get_workflow_code
 from api.mcp_server.tools.node_types import get_node_type, list_node_types
 from api.mcp_server.tools.save_workflow import save_workflow
@@ -27,5 +28,6 @@ for _tool in (
     list_tools,
     list_workflows,
     save_workflow,
+    search_docs,
 ):
     mcp.tool(_tool)
diff --git a/api/mcp_server/tools/docs_search.py b/api/mcp_server/tools/docs_search.py
new file mode 100644
index 0000000..bd7e955
--- /dev/null
+++ b/api/mcp_server/tools/docs_search.py
@@ -0,0 +1,337 @@
+"""`search_docs` MCP tool — keyword search over the Mintlify docs tree.
+
+The docs are shipped into the API image (`COPY ./docs ./docs` in
+`api/Dockerfile`), so this tool works for both source/dev runs and
+Docker deployments. For source/dev runs we walk up from this file to
+locate the `docs/` directory; for Docker we land on `/app/docs`. An
+explicit `DOGRAH_DOCS_PATH` env var overrides discovery.
+
+The implementation is intentionally dependency-free: it does in-memory
+keyword scoring rather than building a vector index. The docs corpus is
+small (~100 .mdx files, ~140k LoC), so a per-call scan is well under
+50 ms and avoids needing an embedding backend, vector store, or
+background indexer for a tool that's called interactively from MCP.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+from functools import lru_cache
+from pathlib import Path
+
+from api.mcp_server.auth import authenticate_mcp_request
+from api.mcp_server.tracing import traced_tool
+
+# Public site for the rendered docs. Used to build a clickable URL per
+# result; agents can hand the URL back to the user even if the local
+# file isn't reachable.
+DOCS_SITE_BASE_URL = "https://docs.dograh.com"
+
+# Hard cap regardless of caller-supplied limit. Keeps the MCP response
+# payload bounded; Mintlify search APIs use a similar 10-25 ceiling.
+DOCS_SEARCH_MAX_LIMIT = 25
+
+# File suffixes (lowercased) that the corpus loader treats as docs pages.
+_DOC_SUFFIXES = frozenset({".mdx", ".md"})
+
+# Heading-detection regex. Matches ATX headings (`# `, `## `, etc.) but
+# not in-line `#` characters. Only spaces/tabs may separate the hashes
+# from the text, so a bare `#` line cannot swallow the following line.
+_HEADING_RE = re.compile(r"^(#{1,6})[ \t]+(.*?)[ \t]*$", re.MULTILINE)
+
+# YAML frontmatter: a `---` fence on the very first line, closed by the
+# next line consisting solely of `---`. Anchoring both fences to whole
+# lines keeps a `---` inside a value (e.g. a description) from being
+# mistaken for the closing fence.
+_FRONTMATTER_RE = re.compile(
+    r"\A---[ \t]*\r?\n(?P<meta>.*?)^---[ \t]*\r?$\n?",
+    re.DOTALL | re.MULTILINE,
+)
+
+
+def _resolve_docs_root() -> Path | None:
+    """Return the path to the on-disk docs tree, or None if not found.
+
+    Resolution order:
+      1. ``DOGRAH_DOCS_PATH`` env var, when it names an existing directory.
+      2. ``/app/docs`` — the location the API Dockerfile copies docs to.
+      3. Walk upward from this file looking for a ``docs/`` directory
+         (covers source-checkout / dev runs).
+
+    An override that does not point at a directory is ignored and
+    discovery falls through to the remaining steps.
+    """
+    override = os.environ.get("DOGRAH_DOCS_PATH")
+    if override:
+        candidate = Path(override).expanduser().resolve()
+        if candidate.is_dir():
+            return candidate
+
+    docker_default = Path("/app/docs")
+    if docker_default.is_dir():
+        return docker_default
+
+    # Walk up from .../api/mcp_server/tools/docs_search.py looking for docs/.
+    for parent in Path(__file__).resolve().parents:
+        candidate = parent / "docs"
+        if candidate.is_dir():
+            return candidate
+
+    return None
+
+
+@lru_cache(maxsize=1)
+def _docs_corpus() -> tuple[tuple[str, str], ...]:
+    """Load the docs corpus once per process.
+
+    Returns a tuple of ``(relative_path, file_contents)`` pairs. The
+    docs tree is small and read-mostly at runtime, so caching the full
+    text in memory is cheaper than re-reading on every search.
+
+    The result (including an empty one) is cached for the life of the
+    process: changing ``DOGRAH_DOCS_PATH`` or editing docs afterwards
+    has no effect until a restart or ``_docs_corpus.cache_clear()``.
+    """
+    root = _resolve_docs_root()
+    if root is None:
+        return ()
+
+    pairs: list[tuple[str, str]] = []
+    for path in sorted(root.rglob("*")):
+        if not path.is_file():
+            continue
+        if path.suffix.lower() not in _DOC_SUFFIXES:
+            continue
+        try:
+            contents = path.read_text(encoding="utf-8")
+        except (OSError, UnicodeDecodeError):
+            # Skip unreadable files rather than crashing the whole tool.
+            continue
+        rel = path.relative_to(root).as_posix()
+        pairs.append((rel, contents))
+    return tuple(pairs)
+
+
+def _tokenize_query(query: str) -> list[str]:
+    """Split a user query into unique, lowercased keyword terms.
+
+    Empty strings and 1-char filler terms are dropped — they would
+    match almost every file and drown out the real signal. Repeated
+    terms are collapsed (first occurrence keeps its position) so that
+    typing a word twice does not double its weight in the scorer.
+    """
+    terms = re.findall(r"[A-Za-z0-9_]+", query.lower())
+    # dict.fromkeys de-duplicates while preserving first-seen order.
+    return list(dict.fromkeys(term for term in terms if len(term) >= 2))
+
+
+def _split_frontmatter(contents: str) -> tuple[str, str]:
+    """Split a docs page into ``(frontmatter, body)``.
+
+    ``frontmatter`` is the text between the opening and closing ``---``
+    fences (empty when the page has no well-formed block); ``body`` is
+    everything after the closing fence with leading blank lines removed.
+    Pages without frontmatter are returned unchanged as the body.
+    """
+    match = _FRONTMATTER_RE.match(contents)
+    if match is None:
+        return "", contents
+    return match.group("meta"), contents[match.end() :].lstrip("\r\n")
+
+
+def _extract_page_title(contents: str, fallback: str) -> str:
+    """Pull a human-readable title for a docs page.
+
+    Mintlify pages start with a YAML frontmatter block whose ``title``
+    is the most authoritative title; fall back to the first non-empty
+    ATX heading in the body if frontmatter is missing or has no title;
+    fall back to ``fallback`` (the caller passes the relative path) if
+    no heading exists.
+    """
+    frontmatter, body = _split_frontmatter(contents)
+    for line in frontmatter.splitlines():
+        line = line.strip()
+        if not line.lower().startswith("title:"):
+            continue
+        value = line.split(":", 1)[1].strip()
+        # Strip surrounding quotes if Mintlify wrote them.
+        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
+            value = value[1:-1]
+        if value:
+            return value
+
+    # Search the body only, so a `# comment` inside the YAML block is
+    # never mistaken for the page heading.
+    for match in _HEADING_RE.finditer(body):
+        heading = match.group(2).strip()
+        if heading:
+            return heading
+
+    return fallback
+
+
+def _strip_frontmatter(contents: str) -> str:
+    """Drop the YAML frontmatter block from a docs page body."""
+    return _split_frontmatter(contents)[1]
+
+
+def _build_snippet(body: str, terms: list[str], snippet_radius: int = 120) -> str:
+    """Return a ~240-char window around the first term hit in ``body``.
+
+    The window is centered on the earliest match of any term, so the
+    snippet shows the first place the query touches the page rather
+    than following the order of terms in the query. Whitespace runs
+    (including newlines) are collapsed so the snippet renders cleanly
+    through MCP's text payload; ``…`` marks a truncated side.
+    """
+    body_lower = body.lower()
+    hits = [idx for idx in map(body_lower.find, terms) if idx != -1]
+
+    if not hits:
+        # No hit in body — the match must have come from the title or
+        # path, so just return the first line of body as orientation.
+        first_line = next(
+            (line.strip() for line in body.splitlines() if line.strip()),
+            "",
+        )
+        return first_line[: snippet_radius * 2]
+
+    earliest = min(hits)
+    start = max(0, earliest - snippet_radius)
+    end = min(len(body), earliest + snippet_radius)
+    # Collapse all whitespace runs (incl. internal newlines) for a
+    # single-line snippet — MCP renders text payloads inline.
+    snippet = " ".join(body[start:end].split())
+    prefix = "…" if start > 0 else ""
+    suffix = "…" if end < len(body) else ""
+    return f"{prefix}{snippet}{suffix}"
+
+
+def _score_page(
+    rel_path: str,
+    title: str,
+    body: str,
+    terms: list[str],
+) -> int:
+    """Weighted keyword score for a single docs page.
+
+    Title/path matches outweigh body matches because they encode the
+    page's purpose, not just incidental mentions. Each query term
+    contributes independently; a term that appears nowhere on the page
+    simply adds nothing. Matching is case-insensitive substring
+    counting, so ``set`` also matches ``setup``.
+    """
+    if not terms:
+        return 0
+    score = 0
+    path_lower = rel_path.lower()
+    title_lower = title.lower()
+    body_lower = body.lower()
+    for term in terms:
+        path_hits = path_lower.count(term)
+        title_hits = title_lower.count(term)
+        # Diminishing returns past a few body hits per term: the cap is
+        # deliberately set so ``title_weight (5)`` strictly exceeds
+        # ``body_cap (4) × body_weight (1)`` — a page whose TITLE is the
+        # term must outrank a page that merely mentions it repeatedly.
+        body_hits = min(body_lower.count(term), 4)
+        score += path_hits * 8 + title_hits * 5 + body_hits
+    return score
+
+
+def _docs_url_for(rel_path: str) -> str:
+    """Build the public docs URL for a relative on-disk path."""
+    # Strip the extension and `index` so `getting-started/index.mdx`
+    # maps to `/getting-started` and a top-level `index.mdx` maps to the
+    # site root, matching Mintlify's routing.
+    no_ext = re.sub(r"\.(mdx|md)$", "", rel_path, flags=re.IGNORECASE)
+    if no_ext == "index":
+        no_ext = ""
+    elif no_ext.endswith("/index"):
+        no_ext = no_ext[: -len("/index")]
+    return f"{DOCS_SITE_BASE_URL}/{no_ext}".rstrip("/")
+
+
+@traced_tool
+async def search_docs(query: str, limit: int = 10) -> list[dict]:
+    """Search the Dograh documentation by keyword and return ranked pages.
+
+    Use this when the caller asks "how do I configure X" / "where are the docs for Y" /
+    "what does Dograh say about Z" — anything that should land on a docs page
+    rather than a workspace resource. For workspace data (agents, recordings,
+    credentials), use ``list_workflows`` / ``list_recordings`` / ``list_credentials``
+    instead.
+
+    Args:
+        query: Free-form keywords (e.g. "TURN server", "elevenlabs voice").
+            Tokenized on non-alphanumeric characters; terms shorter than
+            2 characters are dropped and repeated terms count once.
+        limit: Max pages to return. Capped at 25 regardless of input;
+            default 10 keeps the payload small enough to inline in MCP.
+
+    Returns:
+        Up to ``limit`` results, sorted by descending relevance score.
+        Each entry has:
+          * ``path`` — path relative to the docs root (e.g. ``configurations/voice.mdx``)
+          * ``url`` — public docs URL (https://docs.dograh.com/...)
+          * ``title`` — page title (from Mintlify frontmatter when present)
+          * ``score`` — opaque integer relevance score
+          * ``snippet`` — ~240-char excerpt around the first term hit
+
+    Raises:
+        ValueError: If ``query`` is empty or has no usable keyword, or if
+            ``limit`` is not an integer of at least 1.
+    """
+    # Authentication is consistent with the rest of the MCP tools and
+    # routes through the same rate-limiting path, even though docs are
+    # not org-scoped data.
+    await authenticate_mcp_request()
+
+    if not isinstance(query, str) or not query.strip():
+        raise ValueError("query must be a non-empty string.")
+
+    try:
+        effective_limit = int(limit)
+    except (TypeError, ValueError) as exc:
+        raise ValueError("limit must be an integer.") from exc
+    if effective_limit < 1:
+        raise ValueError("limit must be at least 1.")
+    effective_limit = min(effective_limit, DOCS_SEARCH_MAX_LIMIT)
+
+    terms = _tokenize_query(query)
+    if not terms:
+        # The caller passed something like punctuation-only or only
+        # single-char tokens — surface an actionable error rather than
+        # silently returning everything.
+        raise ValueError(
+            "query must contain at least one keyword of 2+ alphanumeric characters."
+        )
+
+    corpus = _docs_corpus()
+    if not corpus:
+        # Tool is registered but docs aren't on disk — return empty
+        # rather than 500ing so the caller can degrade gracefully.
+        return []
+
+    scored: list[tuple[int, str, str, str]] = []
+    for rel_path, contents in corpus:
+        title = _extract_page_title(contents, fallback=rel_path)
+        body = _strip_frontmatter(contents)
+        score = _score_page(rel_path, title, body, terms)
+        if score <= 0:
+            continue
+        scored.append((score, rel_path, title, body))
+
+    scored.sort(key=lambda item: (-item[0], item[1]))
+
+    return [
+        {
+            "path": rel_path,
+            "url": _docs_url_for(rel_path),
+            "title": title,
+            "score": score,
+            "snippet": _build_snippet(body, terms),
+        }
+        for score, rel_path, title, body in scored[:effective_limit]
+    ]
diff --git a/api/tests/test_mcp_docs_search.py b/api/tests/test_mcp_docs_search.py
new file mode 100644
index 0000000..8b12571
--- /dev/null
+++ b/api/tests/test_mcp_docs_search.py
@@ -0,0 +1,314 @@
+"""Unit tests for the `search_docs` MCP tool.
+
+The tool reads the docs corpus from disk via ``_resolve_docs_root`` and
+caches it with ``functools.lru_cache``. These tests point the cache at
+a synthetic corpus per-test so the assertions don't depend on the real
+docs tree (which evolves) and the LRU cache doesn't leak state.
+
+`authenticate_mcp_request` is mocked so the tests don't need a live DB
+or a valid API key — mirroring the pattern in
+``test_mcp_save_workflow.py``.
+"""
+
+from __future__ import annotations
+
+import os
+from collections.abc import Iterator
+from pathlib import Path
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from api.mcp_server.tools import docs_search as docs_search_module
+from api.mcp_server.tools.docs_search import (
+    DOCS_SEARCH_MAX_LIMIT,
+    _docs_url_for,
+    _extract_page_title,
+    _resolve_docs_root,
+    _score_page,
+    _strip_frontmatter,
+    _tokenize_query,
+    search_docs,
+)
+
+
+# ─── Fixtures ────────────────────────────────────────────────────────────
+
+
+@pytest.fixture
+def fake_docs_root(tmp_path: Path) -> Iterator[Path]:
+    """Build a minimal docs tree on disk and point the tool at it."""
+    docs_root = tmp_path / "docs"
+    docs_root.mkdir()
+
+    (docs_root / "configurations").mkdir()
+    (docs_root / "configurations" / "voice.mdx").write_text(
+        "---\n"
+        'title: "Voice"\n'
+        "---\n\n"
+        "# Voice configuration\n\n"
+        "Dograh supports ElevenLabs and Cartesia TTS providers.\n"
+        "Configure the ElevenLabs voice_id in your workspace settings.\n",
+        encoding="utf-8",
+    )
+    (docs_root / "configurations" / "transcriber.mdx").write_text(
+        "---\n"
+        'title: "Transcriber"\n'
+        "---\n\n"
+        "# Speech-to-text\n\nDeepgram is the default transcriber.\n",
+        encoding="utf-8",
+    )
+
+    (docs_root / "deployment").mkdir()
+    (docs_root / "deployment" / "turn-server.mdx").write_text(
+        "---\n"
+        'title: "TURN server setup"\n'
+        "---\n\n"
+        "# TURN server\n\n"
+        "WebRTC requires a TURN server for NAT traversal. Coturn is the "
+        "recommended choice for self-hosted deployments.\n",
+        encoding="utf-8",
+    )
+
+    # A non-doc file that must be ignored by the corpus loader.
+    (docs_root / "docs.json").write_text('{"name":"Dograh"}', encoding="utf-8")
+
+    # Reset the LRU cache and pin the resolver to our tmp tree.
+    docs_search_module._docs_corpus.cache_clear()
+    with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(docs_root)}):
+        yield docs_root
+    docs_search_module._docs_corpus.cache_clear()
+
+
+@pytest.fixture
+def authed_user():
+    """Stub ``authenticate_mcp_request`` so tests skip the API-key path."""
+
+    class _FakeUser:
+        selected_organization_id = 1
+        id = 42
+
+    with patch(
+        "api.mcp_server.tools.docs_search.authenticate_mcp_request",
+        new=AsyncMock(return_value=_FakeUser()),
+    ):
+        yield _FakeUser()
+
+
+# ─── Pure helpers ────────────────────────────────────────────────────────
+
+
+def test_tokenize_query_strips_short_and_punct_terms():
+    """Punctuation and 1-char tokens must not bleed into the scorer.
+
+    A trailing `?` or stray `a` would otherwise match nearly every page
+    and flatten the relevance ranking.
+    """
+    assert _tokenize_query("How do I configure a TURN server?") == [
+        "how",
+        "do",
+        "configure",
+        "turn",
+        "server",
+    ]
+
+
+def test_tokenize_query_empty_input_returns_empty():
+    assert _tokenize_query("") == []
+    assert _tokenize_query("?? // !!") == []
+
+
+def test_tokenize_query_collapses_repeated_terms():
+    """Repeating a word must not double its weight in the scorer."""
+    assert _tokenize_query("TURN turn server Turn") == ["turn", "server"]
+
+
+def test_strip_frontmatter_removes_yaml_block():
+    body = '---\ntitle: "X"\n---\n\n# Heading\n'
+    assert _strip_frontmatter(body).startswith("# Heading")
+
+
+def test_strip_frontmatter_passes_through_when_missing():
+    body = "# Just a heading\nbody text\n"
+    assert _strip_frontmatter(body) == body
+
+
+def test_strip_frontmatter_ignores_inline_dashes_in_values():
+    """A ``---`` inside a frontmatter value is not the closing fence."""
+    body = '---\ntitle: "X"\ndescription: "a --- b"\n---\n\n# Heading\n'
+    assert _strip_frontmatter(body) == "# Heading\n"
+
+
+def test_extract_page_title_prefers_frontmatter():
+    body = '---\ntitle: "Front Title"\n---\n\n# Heading Title\n'
+    assert _extract_page_title(body, fallback="x.mdx") == "Front Title"
+
+
+def test_extract_page_title_falls_back_to_first_heading():
+    """When frontmatter is missing the first ATX heading is the next best
+    signal — better than just returning the filename, which often is
+    a slug not a human-readable title."""
+    body = "# Heading Title\nbody\n"
+    assert _extract_page_title(body, fallback="x.mdx") == "Heading Title"
+
+
+def test_extract_page_title_falls_back_to_filename_when_nothing_matches():
+    body = "plain prose with no heading or frontmatter"
+    assert _extract_page_title(body, fallback="x.mdx") == "x.mdx"
+
+
+def test_docs_url_for_strips_extension_and_index():
+    assert (
+        _docs_url_for("configurations/voice.mdx")
+        == "https://docs.dograh.com/configurations/voice"
+    )
+    assert (
+        _docs_url_for("getting-started/index.mdx")
+        == "https://docs.dograh.com/getting-started"
+    )
+
+
+def test_docs_url_for_maps_root_index_to_site_root():
+    assert _docs_url_for("index.mdx") == "https://docs.dograh.com"
+
+
+def test_score_page_weights_title_above_body():
+    """Title hits must outweigh body hits — otherwise a long page that
+    incidentally mentions the term many times outranks the page whose
+    purpose IS the term."""
+    title_only = _score_page(
+        rel_path="other.mdx", title="TURN server", body="unrelated text", terms=["turn"]
+    )
+    body_only = _score_page(
+        rel_path="other.mdx",
+        title="Unrelated",
+        body="turn turn turn turn turn",
+        terms=["turn"],
+    )
+    assert title_only > body_only
+
+
+def test_score_page_returns_zero_when_no_terms_match():
+    assert (
+        _score_page(
+            rel_path="x.mdx", title="X", body="hello world", terms=["nonexistent"]
+        )
+        == 0
+    )
+
+
+def test_resolve_docs_root_honors_env_override(tmp_path: Path):
+    docs = tmp_path / "custom_docs"
+    docs.mkdir()
+    with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(docs)}):
+        assert _resolve_docs_root() == docs.resolve()
+
+
+def test_resolve_docs_root_ignores_nonexistent_env_value(tmp_path: Path):
+    """A bogus env value must not crash the tool — fall back to discovery
+    (the real ``docs/`` in the repo) instead."""
+    with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(tmp_path / "nope")}):
+        # Walk-up discovery should land somewhere (the repo's actual docs)
+        # but we don't assert the exact path because it depends on where
+        # the tests are run; we just assert no crash and either None or a dir.
+        resolved = _resolve_docs_root()
+        assert resolved is None or resolved.is_dir()
+
+
+# ─── End-to-end tool behaviour ───────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_search_docs_ranks_turn_setup_first_for_turn_query(
+    fake_docs_root, authed_user
+):
+    """The page whose title and body are both about TURN must outrank
+    incidental mentions of related words on other pages."""
+    results = await search_docs("How do I set up a TURN server?")
+    assert results, "expected at least one result"
+    assert results[0]["path"] == "deployment/turn-server.mdx"
+    assert results[0]["url"] == "https://docs.dograh.com/deployment/turn-server"
+    assert "TURN server" in results[0]["title"]
+    assert "turn" in results[0]["snippet"].lower()
+
+
+@pytest.mark.asyncio
+async def test_search_docs_excludes_non_doc_files(fake_docs_root, authed_user):
+    """``docs.json`` must not appear — the corpus loader filters to
+    .mdx/.md only."""
+    results = await search_docs("Dograh")
+    paths = [r["path"] for r in results]
+    assert "docs.json" not in paths
+
+
+@pytest.mark.asyncio
+async def test_search_docs_returns_empty_when_no_match(fake_docs_root, authed_user):
+    results = await search_docs("xyzzy unrelated zzz")
+    assert results == []
+
+
+@pytest.mark.asyncio
+async def test_search_docs_respects_limit(fake_docs_root, authed_user):
+    """``limit=1`` must collapse the result list even if multiple pages
+    match."""
+    # Both pages under `configurations/` match via their path, so the
+    # unlimited result set is larger than the limit being tested.
+    assert len(await search_docs("configurations")) > 1
+    results = await search_docs("configurations", limit=1)
+    assert len(results) == 1
+
+
+@pytest.mark.asyncio
+async def test_search_docs_clamps_limit_to_hard_cap(fake_docs_root, authed_user):
+    """A pathological large limit must be clamped to
+    ``DOCS_SEARCH_MAX_LIMIT`` (=25) so the payload stays bounded."""
+    # Drop in extra docs so there's headroom to verify the clamp.
+    for i in range(30):
+        (fake_docs_root / f"extra-{i}.mdx").write_text(
+            f"# Page {i}\nThis Dograh page covers configurations topic {i}.\n",
+            encoding="utf-8",
+        )
+    docs_search_module._docs_corpus.cache_clear()
+    results = await search_docs("Dograh", limit=999)
+    assert len(results) == DOCS_SEARCH_MAX_LIMIT
+
+
+@pytest.mark.asyncio
+async def test_search_docs_returns_empty_when_no_corpus(authed_user):
+    """If the docs directory doesn't exist on disk, the tool must
+    degrade to an empty list rather than raising — Docker images and
+    dev checkouts can disagree on layout."""
+    # Simulate "no docs tree anywhere" by stubbing the resolver itself.
+    # The cache is cleared on both sides so neither a previously loaded
+    # corpus nor the empty one cached here leaks into other tests.
+    docs_search_module._docs_corpus.cache_clear()
+    try:
+        with patch(
+            "api.mcp_server.tools.docs_search._resolve_docs_root", return_value=None
+        ):
+            results = await search_docs("anything")
+    finally:
+        docs_search_module._docs_corpus.cache_clear()
+    assert results == []
+
+
+@pytest.mark.asyncio
+async def test_search_docs_rejects_empty_query(fake_docs_root, authed_user):
+    with pytest.raises(ValueError, match="non-empty string"):
+        await search_docs("")
+
+
+@pytest.mark.asyncio
+async def test_search_docs_rejects_query_with_no_real_terms(
+    fake_docs_root, authed_user
+):
+    """A query like `"???"` tokenizes to nothing — surface an actionable
+    error rather than silently returning every page."""
+    with pytest.raises(ValueError, match="2\\+ alphanumeric"):
+        await search_docs("?? // !!")
+
+
+@pytest.mark.asyncio
+async def test_search_docs_rejects_zero_limit(fake_docs_root, authed_user):
+    with pytest.raises(ValueError, match="at least 1"):
+        await search_docs("Dograh", limit=0)