diff --git a/api/mcp_server/instructions.py b/api/mcp_server/instructions.py index f0b2618..00a0fd4 100644 --- a/api/mcp_server/instructions.py +++ b/api/mcp_server/instructions.py @@ -16,6 +16,11 @@ You build and edit Dograh voice-AI workflows by emitting TypeScript that uses th ## Call order +### Reading documentation +1. `search_docs(query)` — use first for keyword or acronym lookup when the user is asking how Dograh works or how to configure something. +2. `read_doc(path)` — fetch the full page once one result looks likely. Prefer this over reasoning from search summaries alone. +3. `list_docs(path=None, depth=1)` — use when the user wants to browse a topic area or when search terms are too vague. Returned section paths feed back into `list_docs`; returned page paths feed into `read_doc`. + ### Editing an existing workflow 1. `list_workflows` — locate the target workflow. 2. `get_workflow_code(workflow_id)` — fetch the current source. diff --git a/api/mcp_server/server.py b/api/mcp_server/server.py index 0b007f2..5deef6c 100644 --- a/api/mcp_server/server.py +++ b/api/mcp_server/server.py @@ -1,4 +1,5 @@ from fastmcp import FastMCP +from mcp.types import ToolAnnotations from api.mcp_server.instructions import DOGRAH_MCP_INSTRUCTIONS from api.mcp_server.tools.catalog import ( @@ -8,7 +9,7 @@ from api.mcp_server.tools.catalog import ( list_tools, ) from api.mcp_server.tools.create_workflow import create_workflow -from api.mcp_server.tools.docs_search import search_docs +from api.mcp_server.tools.docs_search import list_docs, read_doc, search_docs from api.mcp_server.tools.get_workflow_code import get_workflow_code from api.mcp_server.tools.node_types import get_node_type, list_node_types from api.mcp_server.tools.save_workflow import save_workflow @@ -28,6 +29,15 @@ for _tool in ( list_tools, list_workflows, save_workflow, - search_docs, ): mcp.tool(_tool) + +_DOCS_TOOL_ANNOTATIONS = ToolAnnotations( + readOnlyHint=True, + idempotentHint=True, + destructiveHint=False, + openWorldHint=False, +) + +for _tool in (list_docs, read_doc, search_docs): + mcp.tool(_tool, annotations=_DOCS_TOOL_ANNOTATIONS) diff --git a/api/mcp_server/tools/docs_search.py b/api/mcp_server/tools/docs_search.py index bd7e955..b679e44 100644 --- a/api/mcp_server/tools/docs_search.py +++ b/api/mcp_server/tools/docs_search.py @@ -1,312 +1,704 @@ -"""`search_docs` MCP tool — keyword search over the Mintlify docs tree. +"""MCP docs discovery tools over the Mintlify docs tree. -The docs are shipped into the API image (`COPY ./docs ./docs` in -`api/Dockerfile`), so this tool works for both source/dev runs and -Docker deployments. For source/dev runs we walk up from this file to -locate the `docs/` directory; for Docker we land on `/app/docs`. An -explicit `DOGRAH_DOCS_PATH` env var overrides discovery. +The docs surface is intentionally split into three steps: -The implementation is intentionally dependency-free: it does in-memory -keyword scoring rather than building a vector index. The docs corpus is -small (~100 .mdx files, ~140k LoC), so a per-call scan is well under -50 ms and avoids needing an embedding backend, vector store, or -background indexer for a tool that's called interactively from MCP. +- ``list_docs`` for lightweight navigation over the published hierarchy +- ``search_docs`` for keyword lookup across the visible docs catalog +- ``read_doc`` for the full content of one chosen page (or one section) + +The runtime index is derived from ``docs/docs.json`` plus the referenced +``.mdx``/``.md`` files. That keeps navigation, ordering, and visibility in +sync with the published docs rather than indexing every file under ``docs/``. """ from __future__ import annotations +import json import os import re +from collections import Counter +from dataclasses import dataclass, replace from functools import lru_cache from pathlib import Path +from typing import Any + +import yaml +from fastapi import HTTPException from api.mcp_server.auth import authenticate_mcp_request from api.mcp_server.tracing import traced_tool -# Public site for the rendered docs. Used to build a clickable URL per -# result; agents can hand the URL back to the user even if the local -# file isn't reachable. -DOCS_SITE_BASE_URL = "https://docs.dograh.com" - -# Hard cap regardless of caller-supplied limit. Keeps the MCP response -# payload bounded; Mintlify search APIs use a similar 10-25 ceiling. DOCS_SEARCH_MAX_LIMIT = 25 +DOCS_LIST_MAX_DEPTH = 3 +_ROOT_SECTION_PATH = "__root__" -# Heading-detection regex. Matches ATX headings (`# `, `## `, etc.) but -# not in-line `#` characters. +_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+") +_FRONTMATTER_RE = re.compile(r"\A---\s*\n(.*?)\n---\s*\n?", re.DOTALL) _HEADING_RE = re.compile(r"^(#{1,6})\s+(.*?)\s*$", re.MULTILINE) +_STOPWORDS = { + "a", + "an", + "and", + "are", + "at", + "be", + "by", + "can", + "do", + "for", + "from", + "how", + "i", + "if", + "in", + "is", + "it", + "me", + "my", + "of", + "on", + "or", + "the", + "to", + "what", + "when", + "where", + "with", + "you", + "your", +} + + +@dataclass(frozen=True) +class DocSection: + title: str + slug: str + level: int + content: str + + +@dataclass(frozen=True) +class DocPage: + path: str + file_path: str + title: str + description: str + llm_hint: str + aliases: tuple[str, ...] + breadcrumb: tuple[str, ...] + content: str + sections: tuple[DocSection, ...] + order: int + + def breadcrumb_text(self) -> str: + return " > ".join(self.breadcrumb) + + def routing_hint(self) -> str: + return self.llm_hint or self.description + + def to_catalog_dict(self, section: DocSection | None = None) -> dict: + data = { + "kind": "page", + "path": self.path, + "title": self.title, + "breadcrumb": self.breadcrumb_text(), + "llm_hint": self.routing_hint(), + } + if section is not None: + data["section_title"] = section.title + data["section_slug"] = section.slug + return _compact_dict(data) + + def to_read_dict(self, section: DocSection | None = None) -> dict: + active_section = section + content = self.content + if active_section is not None: + content = active_section.content + + return _compact_dict( + { + "path": self.path, + "title": self.title, + "breadcrumb": self.breadcrumb_text(), + "llm_hint": self.routing_hint(), + "section_title": active_section.title if active_section else None, + "section_slug": active_section.slug if active_section else None, + "content": content, + "sections": [ + {"title": sec.title, "slug": sec.slug} + for sec in self.sections + if sec.title and sec.slug + ], + } + ) + + +@dataclass(frozen=True) +class NavSection: + path: str + title: str + breadcrumb: tuple[str, ...] + children: tuple[tuple[str, str], ...] + descendant_page_count: int = 0 + + def breadcrumb_text(self) -> str: + return " > ".join(self.breadcrumb) + + def to_mcp_dict(self) -> dict: + hint = None + if self.descendant_page_count: + hint = f"Browse {self.descendant_page_count} docs in this section." + return _compact_dict( + { + "kind": "section", + "path": self.path, + "title": self.title, + "breadcrumb": self.breadcrumb_text(), + "llm_hint": hint, + "has_children": bool(self.children), + "child_count": len(self.children), + "page_count": self.descendant_page_count, + } + ) + + +@dataclass(frozen=True) +class DocsIndex: + pages_by_path: dict[str, DocPage] + sections_by_path: dict[str, NavSection] + + +def _compact_dict(data: dict[str, Any]) -> dict[str, Any]: + return { + key: value for key, value in data.items() if value not in (None, "", [], (), {}) + } + + +def _slugify(value: str) -> str: + slug = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-") + return slug or "section" + + +def _coerce_docs_root(candidate: Path) -> Path | None: + candidate = candidate.expanduser().resolve() + if (candidate / "docs.json").is_file(): + return candidate + nested = candidate / "docs" + if (nested / "docs.json").is_file(): + return nested + return None def _resolve_docs_root() -> Path | None: - """Return the path to the on-disk docs tree, or None if not found. - - Resolution order: - 1. ``DOGRAH_DOCS_PATH`` env var (absolute path). - 2. ``/app/docs`` — the location the API Dockerfile copies docs to. - 3. Walk upward from this file looking for a sibling ``docs/`` dir - (covers source-checkout / dev runs). - """ + """Return the path to the on-disk docs tree, or None if not found.""" override = os.environ.get("DOGRAH_DOCS_PATH") if override: - candidate = Path(override).expanduser().resolve() - if candidate.is_dir(): - return candidate + resolved = _coerce_docs_root(Path(override)) + if resolved is not None: + return resolved - docker_default = Path("/app/docs") - if docker_default.is_dir(): + docker_default = _coerce_docs_root(Path("/app/docs")) + if docker_default is not None: return docker_default - # Walk up from .../api/mcp_server/tools/docs_search.py looking for docs/. for parent in Path(__file__).resolve().parents: - candidate = parent / "docs" - if candidate.is_dir(): - return candidate + resolved = _coerce_docs_root(parent / "docs") + if resolved is not None: + return resolved return None -@lru_cache(maxsize=1) -def _docs_corpus() -> tuple[tuple[str, str], ...]: - """Load the docs corpus once per process. - - Returns a tuple of ``(relative_path, file_contents)`` pairs. The - docs tree is small and read-mostly at runtime, so caching the full - text in memory is cheaper than re-reading on every search. - Cache miss is intentional when ``DOGRAH_DOCS_PATH`` flips at - startup — for live edits, restart the process. - """ - root = _resolve_docs_root() - if root is None: - return () - - pairs: list[tuple[str, str]] = [] - for path in sorted(root.rglob("*")): - if not path.is_file(): - continue - if path.suffix.lower() not in {".mdx", ".md"}: - continue - try: - contents = path.read_text(encoding="utf-8") - except (OSError, UnicodeDecodeError): - # Skip unreadable files rather than crashing the whole tool. - continue - rel = path.relative_to(root).as_posix() - pairs.append((rel, contents)) - return tuple(pairs) - - -def _tokenize_query(query: str) -> list[str]: - """Split a user query into lowercased keyword terms. - - Empty strings and 1-char filler terms are dropped — they would - match almost every file and drown out the real signal. - """ - terms = re.findall(r"[A-Za-z0-9_]+", query.lower()) - return [term for term in terms if len(term) >= 2] - - -def _extract_page_title(contents: str, fallback: str) -> str: - """Pull a human-readable title for a docs page. - - Mintlify pages start with a YAML frontmatter block whose ``title`` - is the most authoritative title; fall back to the first ATX heading - if frontmatter is missing or malformed; fall back to the filename - if no heading exists. - """ - if contents.startswith("---"): - end = contents.find("---", 3) - if end != -1: - frontmatter = contents[3:end] - for line in frontmatter.splitlines(): - line = line.strip() - if line.lower().startswith("title:"): - value = line.split(":", 1)[1].strip() - # Strip surrounding quotes if Mintlify wrote them. - if ( - len(value) >= 2 - and value[0] == value[-1] - and value[0] in ('"', "'") - ): - value = value[1:-1] - if value: - return value - - match = _HEADING_RE.search(contents) - if match: - return match.group(2).strip() - - return fallback +def _split_frontmatter(contents: str) -> tuple[dict[str, Any], str]: + match = _FRONTMATTER_RE.match(contents) + if not match: + return {}, contents + try: + frontmatter = yaml.safe_load(match.group(1)) or {} + except yaml.YAMLError: + return {}, contents + if not isinstance(frontmatter, dict): + frontmatter = {} + return frontmatter, contents[match.end() :].lstrip("\n") def _strip_frontmatter(contents: str) -> str: """Drop the YAML frontmatter block from a docs page body.""" - if not contents.startswith("---"): - return contents - end = contents.find("---", 3) - if end == -1: - return contents - return contents[end + 3 :].lstrip("\n") + return _split_frontmatter(contents)[1] -def _build_snippet(body: str, terms: list[str], snippet_radius: int = 120) -> str: - """Return a ~240-char window around the first term hit in ``body``. +def _clean_heading_text(raw: str) -> str: + text = re.sub(r"\s*\{#.*\}\s*$", "", raw.strip()) + return " ".join(text.split()) - The window is centered on the earliest match (whichever term comes - first wins) so the snippet shows context for the strongest signal, - not the lexicographically-first term. Leading/trailing newlines are - collapsed so the snippet renders cleanly through MCP's text payload. - """ - body_lower = body.lower() - earliest = -1 - for term in terms: - idx = body_lower.find(term) - if idx != -1 and (earliest == -1 or idx < earliest): - earliest = idx - if earliest == -1: - # No hit in body — the match must have come from the title or - # path, so just return the first line of body as orientation. - first_line = next( - (line.strip() for line in body.splitlines() if line.strip()), - "", +def _extract_page_title(contents: str, fallback: str) -> str: + """Pull a human-readable title for a docs page.""" + frontmatter, body = _split_frontmatter(contents) + title = frontmatter.get("title") + if isinstance(title, str) and title.strip(): + return title.strip() + + match = _HEADING_RE.search(body) + if match: + return _clean_heading_text(match.group(2)) + + return fallback + + +def _normalize_text(value: Any) -> str: + if isinstance(value, str): + return " ".join(value.strip().split()) + return "" + + +def _normalize_aliases(value: Any) -> tuple[str, ...]: + if isinstance(value, str): + aliases = [value] + elif isinstance(value, list): + aliases = [item for item in value if isinstance(item, str)] + else: + aliases = [] + return tuple(alias.strip() for alias in aliases if alias.strip()) + + +def _extract_sections(body: str) -> tuple[DocSection, ...]: + matches = list(_HEADING_RE.finditer(body)) + stripped_body = body.strip() + if not matches: + if not stripped_body: + return () + return ( + DocSection( + title="Overview", + slug="overview", + level=1, + content=stripped_body, + ), ) - return first_line[: snippet_radius * 2] - start = max(0, earliest - snippet_radius) - end = min(len(body), earliest + snippet_radius) - snippet = body[start:end] - # Collapse all whitespace runs (incl. internal newlines) for a - # single-line snippet — MCP renders text payloads inline. - snippet = " ".join(snippet.split()) - prefix = "…" if start > 0 else "" - suffix = "…" if end < len(body) else "" - return f"{prefix}{snippet}{suffix}" + sections: list[DocSection] = [] + preamble = body[: matches[0].start()].strip() + if preamble: + sections.append( + DocSection( + title="Overview", + slug="overview", + level=1, + content=preamble, + ) + ) + + for index, match in enumerate(matches): + start = match.start() + end = matches[index + 1].start() if index + 1 < len(matches) else len(body) + title = _clean_heading_text(match.group(2)) + sections.append( + DocSection( + title=title or "Section", + slug=_slugify(title or "section"), + level=len(match.group(1)), + content=body[start:end].strip(), + ) + ) + return tuple(sections) -def _score_page( - rel_path: str, - title: str, - body: str, - terms: list[str], -) -> int: - """Weighted keyword score for a single docs page. +def _tokenize_text(text: str) -> list[str]: + return [ + token + for token in _TOKEN_RE.findall(text.lower()) + if len(token) >= 2 and token not in _STOPWORDS + ] - Title/path matches outweigh body matches because they encode the - page's purpose, not just incidental mentions. Each query term - contributes independently — a page matching all terms ranks above - one matching a single term many times. - """ - if not terms: - return 0 - score = 0 - path_lower = rel_path.lower() - title_lower = title.lower() - body_lower = body.lower() - for term in terms: - path_hits = path_lower.count(term) - title_hits = title_lower.count(term) - body_hits = body_lower.count(term) - if path_hits == 0 and title_hits == 0 and body_hits == 0: - # Penalize pages that miss any query term — they probably - # aren't what the caller wants. + +def _tokenize_query(query: str) -> list[str]: + """Split a user query into lowercased keyword terms.""" + seen: set[str] = set() + terms: list[str] = [] + for token in _TOKEN_RE.findall(query.lower()): + if len(token) < 2 or token in _STOPWORDS or token in seen: continue - # Diminishing returns past a few hits per term: 1 dominant page - # shouldn't outweigh a page that hits every term. The cap is - # deliberately set so ``title_weight (5)`` strictly exceeds - # ``body_cap (4) × body_weight (1)`` — a page whose TITLE is the - # term must outrank a page that merely mentions it repeatedly. - body_hits = min(body_hits, 4) - score += path_hits * 8 + title_hits * 5 + body_hits + seen.add(token) + terms.append(token) + return terms + + +def _resolve_doc_file(root: Path, route_path: str) -> Path | None: + candidates = ( + root / f"{route_path}.mdx", + root / f"{route_path}.md", + root / route_path / "index.mdx", + root / route_path / "index.md", + ) + for candidate in candidates: + if candidate.is_file(): + return candidate + return None + + +def _build_doc_page( + root: Path, + route_path: str, + *, + breadcrumb: tuple[str, ...], + order: int, +) -> DocPage | None: + file_path = _resolve_doc_file(root, route_path) + if file_path is None: + return None + try: + contents = file_path.read_text(encoding="utf-8") + except (OSError, UnicodeDecodeError): + return None + + frontmatter, body = _split_frontmatter(contents) + fallback = route_path.rsplit("/", 1)[-1].replace("-", " ").title() + title = _extract_page_title(contents, fallback=fallback) + description = _normalize_text(frontmatter.get("description")) + llm_hint = _normalize_text(frontmatter.get("llm_hint")) + aliases = _normalize_aliases(frontmatter.get("aliases")) + content = body.strip() + + return DocPage( + path=route_path, + file_path=file_path.relative_to(root).as_posix(), + title=title, + description=description, + llm_hint=llm_hint, + aliases=aliases, + breadcrumb=breadcrumb, + content=content, + sections=_extract_sections(content), + order=order, + ) + + +def _score_counter(counter: Counter[str], term: str, *, weight: int, cap: int) -> int: + return min(counter.get(term, 0), cap) * weight + + +def _normalized_phrase(text: str) -> str: + return " ".join(_tokenize_text(text)) + + +def _score_section(section: DocSection, terms: list[str]) -> int: + title_counts = Counter(_tokenize_text(section.title)) + body_counts = Counter(_tokenize_text(section.content)) + score = 0 + matched_terms = 0 + for term in terms: + term_score = _score_counter( + title_counts, term, weight=7, cap=2 + ) + _score_counter(body_counts, term, weight=1, cap=4) + if term_score: + matched_terms += 1 + score += term_score + score += matched_terms * 4 + + phrase = " ".join(terms) + if phrase and phrase in _normalized_phrase(section.content): + score += 6 return score -def _docs_url_for(rel_path: str) -> str: - """Build the public docs URL for a relative on-disk path.""" - # Strip the extension and `index` so `getting-started/index.mdx` - # maps to `/getting-started`, matching Mintlify's routing. - no_ext = re.sub(r"\.(mdx|md)$", "", rel_path, flags=re.IGNORECASE) - if no_ext.endswith("/index"): - no_ext = no_ext[: -len("/index")] - return f"{DOCS_SITE_BASE_URL}/{no_ext}".rstrip("/") +def _score_page(page: DocPage, terms: list[str]) -> tuple[int, DocSection | None]: + if not terms: + return 0, None + + path_counts = Counter(_tokenize_text(page.path)) + title_counts = Counter(_tokenize_text(page.title)) + breadcrumb_counts = Counter(_tokenize_text(" ".join(page.breadcrumb))) + hint_counts = Counter(_tokenize_text(page.routing_hint())) + alias_counts = Counter(_tokenize_text(" ".join(page.aliases))) + + score = 0 + matched_terms = 0 + for term in terms: + term_score = ( + _score_counter(path_counts, term, weight=6, cap=3) + + _score_counter(title_counts, term, weight=10, cap=2) + + _score_counter(breadcrumb_counts, term, weight=4, cap=2) + + _score_counter(hint_counts, term, weight=7, cap=3) + + _score_counter(alias_counts, term, weight=7, cap=3) + ) + if term_score: + matched_terms += 1 + score += term_score + + best_section = None + best_section_score = 0 + for section in page.sections: + section_score = _score_section(section, terms) + if section_score > best_section_score: + best_section = section + best_section_score = section_score + + if score == 0 and best_section_score == 0: + return 0, None + + score += matched_terms * 8 + best_section_score + + phrase = " ".join(terms) + if phrase: + if phrase in _normalized_phrase(page.title): + score += 12 + elif phrase in _normalized_phrase(page.routing_hint()): + score += 8 + elif phrase in _normalized_phrase(page.path): + score += 8 + elif best_section is not None and phrase in _normalized_phrase( + best_section.content + ): + score += 4 + + return score, best_section + + +def _set_descendant_counts( + sections_by_path: dict[str, NavSection], + section_path: str, +) -> int: + section = sections_by_path[section_path] + page_count = 0 + for child_kind, child_path in section.children: + if child_kind == "page": + page_count += 1 + else: + page_count += _set_descendant_counts(sections_by_path, child_path) + sections_by_path[section_path] = replace(section, descendant_page_count=page_count) + return page_count + + +@lru_cache(maxsize=1) +def _docs_index() -> DocsIndex: + root = _resolve_docs_root() + if root is None: + return DocsIndex(pages_by_path={}, sections_by_path={}) + + try: + docs_config = json.loads((root / "docs.json").read_text(encoding="utf-8")) + except (OSError, UnicodeDecodeError, json.JSONDecodeError): + return DocsIndex(pages_by_path={}, sections_by_path={}) + + pages_by_path: dict[str, DocPage] = {} + sections_by_path: dict[str, NavSection] = {} + page_order = 0 + + def ensure_unique_section_path(base_path: str) -> str: + if base_path not in sections_by_path: + return base_path + suffix = 2 + while f"{base_path}-{suffix}" in sections_by_path: + suffix += 1 + return f"{base_path}-{suffix}" + + def walk_pages( + items: list[Any], + *, + section_path: str, + section_title: str, + ancestor_breadcrumb: tuple[str, ...], + ) -> None: + nonlocal page_order + children: list[tuple[str, str]] = [] + page_breadcrumb = ancestor_breadcrumb + (section_title,) + + for item in items: + if isinstance(item, str): + route_path = item.strip("/") + if not route_path: + continue + if route_path not in pages_by_path: + page = _build_doc_page( + root, + route_path, + breadcrumb=page_breadcrumb, + order=page_order, + ) + if page is not None: + pages_by_path[route_path] = page + page_order += 1 + if route_path in pages_by_path: + children.append(("page", route_path)) + continue + + if not isinstance(item, dict): + continue + group_title = str(item.get("group", "")).strip() + nested_pages = item.get("pages") + if not group_title or not isinstance(nested_pages, list): + continue + + child_path = ensure_unique_section_path( + f"{section_path}/{_slugify(group_title)}" + ) + walk_pages( + nested_pages, + section_path=child_path, + section_title=group_title, + ancestor_breadcrumb=page_breadcrumb, + ) + children.append(("section", child_path)) + + sections_by_path[section_path] = NavSection( + path=section_path, + title=section_title, + breadcrumb=ancestor_breadcrumb, + children=tuple(children), + ) + + root_children: list[tuple[str, str]] = [] + tabs = docs_config.get("navigation", {}).get("tabs", []) + for tab in tabs: + if not isinstance(tab, dict): + continue + tab_title = str(tab.get("tab", "")).strip() or "Docs" + for group in tab.get("groups", []): + if not isinstance(group, dict): + continue + group_title = str(group.get("group", "")).strip() + group_pages = group.get("pages") + if not group_title or not isinstance(group_pages, list): + continue + top_level_path = ensure_unique_section_path( + f"{_slugify(tab_title)}/{_slugify(group_title)}" + ) + walk_pages( + group_pages, + section_path=top_level_path, + section_title=group_title, + ancestor_breadcrumb=(tab_title,), + ) + root_children.append(("section", top_level_path)) + + sections_by_path[_ROOT_SECTION_PATH] = NavSection( + path=_ROOT_SECTION_PATH, + title="Docs", + breadcrumb=(), + children=tuple(root_children), + ) + _set_descendant_counts(sections_by_path, _ROOT_SECTION_PATH) + + return DocsIndex(pages_by_path=pages_by_path, sections_by_path=sections_by_path) + + +def _get_page_or_404(path: str) -> DocPage: + page = _docs_index().pages_by_path.get(path.strip("/")) + if page is None: + raise HTTPException(status_code=404, detail=f"Unknown docs page: {path!r}") + return page + + +def _find_section(page: DocPage, section: str) -> DocSection | None: + target = section.strip().lower() + for candidate in page.sections: + if candidate.slug.lower() == target or candidate.title.lower() == target: + return candidate + return None + + +def _expand_nav_entries( + index: DocsIndex, + section_path: str, + depth: int, +) -> list[dict]: + section = index.sections_by_path[section_path] + results: list[dict] = [] + for child_kind, child_path in section.children: + if child_kind == "section": + child_section = index.sections_by_path[child_path] + results.append(child_section.to_mcp_dict()) + if depth > 1: + results.extend(_expand_nav_entries(index, child_path, depth - 1)) + else: + results.append(index.pages_by_path[child_path].to_catalog_dict()) + return results @traced_tool -async def search_docs(query: str, limit: int = 10) -> list[dict]: - """Search the Dograh documentation by keyword and return ranked pages. +async def list_docs(path: str | None = None, depth: int = 1) -> list[dict]: + """Browse the Dograh docs hierarchy before reading a page in full. - Use this when the caller asks "how do I configure X" / "where are the docs for Y" / - "what does Dograh say about Z" — anything that should land on a docs page - rather than a workspace resource. For workspace data (agents, recordings, - credentials), use ``list_workflows`` / ``list_recordings`` / ``list_credentials`` - instead. - - Args: - query: Free-form keywords (e.g. "TURN server", "elevenlabs voice"). - Tokenized on non-alphanumeric characters; terms shorter than - 2 characters are dropped. - limit: Max pages to return. Capped at 25 regardless of input; - default 10 keeps the payload small enough to inline in MCP. - - Returns: - Up to ``limit`` results, sorted by descending relevance score. - Each entry has: - * ``path`` — repo-relative path (e.g. ``configurations/voice.mdx``) - * ``url`` — public docs URL (https://docs.dograh.com/...) - * ``title`` — page title (from Mintlify frontmatter when present) - * ``score`` — opaque integer relevance score - * ``snippet`` — ~240-char excerpt around the first term hit + ``path`` addresses navigation sections exposed by this tool. Page paths + returned by ``search_docs`` and ``read_doc`` are the published docs routes + instead, for example ``voice-agent/tools/mcp-tool``. + """ + await authenticate_mcp_request() + + if depth < 1 or depth > DOCS_LIST_MAX_DEPTH: + raise ValueError(f"`depth` must be between 1 and {DOCS_LIST_MAX_DEPTH}.") + + index = _docs_index() + if not index.sections_by_path: + return [] + + if path is None: + return _expand_nav_entries(index, _ROOT_SECTION_PATH, depth) + + normalized = path.strip("/") + if normalized in index.sections_by_path: + return _expand_nav_entries(index, normalized, depth) + if normalized in index.pages_by_path: + return [index.pages_by_path[normalized].to_catalog_dict()] + + raise HTTPException(status_code=404, detail=f"Unknown docs section: {path!r}") + + +@traced_tool +async def read_doc(path: str, section: str | None = None) -> dict: + """Read one docs page after you have narrowed to a likely match.""" + await authenticate_mcp_request() + + if not isinstance(path, str) or not path.strip(): + raise ValueError("`path` must be a non-empty string.") + + page = _get_page_or_404(path) + active_section = None + if section is not None: + active_section = _find_section(page, section) + if active_section is None: + raise HTTPException( + status_code=404, + detail=f"Unknown section {section!r} for docs page {path!r}", + ) + return page.to_read_dict(section=active_section) + + +@traced_tool +async def search_docs(query: str, limit: int = 5) -> list[dict]: + """Search the Dograh documentation and return a lean ranked shortlist. + + Use this first for keyword or acronym lookup. Once the right page looks + likely, call ``read_doc(path)`` instead of reasoning from summaries alone. """ - # Authentication is consistent with the rest of the MCP tools and - # routes through the same rate-limiting path, even though docs are - # not org-scoped data. await authenticate_mcp_request() if not isinstance(query, str) or not query.strip(): - raise ValueError("query must be a non-empty string.") - - try: - effective_limit = int(limit) - except (TypeError, ValueError) as exc: - raise ValueError("limit must be an integer.") from exc - if effective_limit < 1: - raise ValueError("limit must be at least 1.") - effective_limit = min(effective_limit, DOCS_SEARCH_MAX_LIMIT) + raise ValueError("`query` must be a non-empty string.") + if limit < 1: + raise ValueError("`limit` must be at least 1.") terms = _tokenize_query(query) if not terms: - # The caller passed something like punctuation-only or only - # single-char tokens — surface an actionable error rather than - # silently returning everything. raise ValueError( - "query must contain at least one keyword of 2+ alphanumeric characters." + "`query` must contain at least one non-stopword alphanumeric term." ) - corpus = _docs_corpus() - if not corpus: - # Tool is registered but docs aren't on disk — return empty - # rather than 500ing so the caller can degrade gracefully. + index = _docs_index() + if not index.pages_by_path: return [] - scored: list[tuple[int, str, str, str]] = [] - for rel_path, contents in corpus: - title = _extract_page_title(contents, fallback=rel_path) - body = _strip_frontmatter(contents) - score = _score_page(rel_path, title, body, terms) + capped_limit = min(limit, DOCS_SEARCH_MAX_LIMIT) + ranked: list[tuple[int, int, DocPage, DocSection | None]] = [] + for page in index.pages_by_path.values(): + score, best_section = _score_page(page, terms) if score <= 0: continue - scored.append((score, rel_path, title, body)) + ranked.append((score, page.order, page, best_section)) - scored.sort(key=lambda item: (-item[0], item[1])) - - results: list[dict] = [] - for score, rel_path, title, body in scored[:effective_limit]: - results.append( - { - "path": rel_path, - "url": _docs_url_for(rel_path), - "title": title, - "score": score, - "snippet": _build_snippet(body, terms), - } - ) - return results + ranked.sort(key=lambda item: (-item[0], item[1], item[2].path)) + return [ + page.to_catalog_dict(section=best_section) + for _, _, page, best_section in ranked[:capped_limit] + ] diff --git a/api/services/workflow/workflow_graph.py b/api/services/workflow/workflow_graph.py index ccb8deb..a626815 100644 --- a/api/services/workflow/workflow_graph.py +++ b/api/services/workflow/workflow_graph.py @@ -1,6 +1,6 @@ import re from collections import Counter -from typing import Any, Dict, List, Set +from typing import Dict, List, Set from api.services.workflow.dto import EdgeDataDTO, NodeType, ReactFlowDTO from api.services.workflow.errors import ItemKind, WorkflowError diff --git a/api/tests/test_mcp_docs_search.py b/api/tests/test_mcp_docs_search.py index 8b12571..5019aeb 100644 --- a/api/tests/test_mcp_docs_search.py +++ b/api/tests/test_mcp_docs_search.py @@ -1,14 +1,4 @@ -"""Unit tests for the `search_docs` MCP tool. - -The tool reads the docs corpus from disk via ``_resolve_docs_root`` and -caches it with ``functools.lru_cache``. These tests point the cache at -a synthetic corpus per-test so the assertions don't depend on the real -docs tree (which evolves) and the LRU cache doesn't leak state. - -`authenticate_mcp_request` is mocked so the tests don't need a live DB -or a valid API key — mirroring the pattern in -``test_mcp_save_workflow.py``. -""" +"""Unit tests for the MCP docs discovery tools.""" from __future__ import annotations @@ -17,71 +7,152 @@ from pathlib import Path from unittest.mock import AsyncMock, patch import pytest +from fastapi import HTTPException from api.mcp_server.tools import docs_search as docs_search_module from api.mcp_server.tools.docs_search import ( - _docs_url_for, + _docs_index, _extract_page_title, _resolve_docs_root, _score_page, _strip_frontmatter, _tokenize_query, + list_docs, + read_doc, search_docs, ) -# ─── Fixtures ──────────────────────────────────────────────────────────── +def _clear_docs_caches() -> None: + docs_search_module._docs_index.cache_clear() @pytest.fixture def fake_docs_root(tmp_path: Path) -> Path: - """Build a minimal docs tree on disk and point the tool at it.""" docs_root = tmp_path / "docs" docs_root.mkdir() - (docs_root / "configurations").mkdir() - (docs_root / "configurations" / "voice.mdx").write_text( + (docs_root / "getting-started").mkdir() + (docs_root / "getting-started" / "index.mdx").write_text( "---\n" - 'title: "Voice"\n' + 'title: "Getting started"\n' + 'description: "Start using Dograh."\n' "---\n\n" - "# Voice configuration\n\n" - "Dograh supports ElevenLabs and Cartesia TTS providers.\n" - "Configure the ElevenLabs voice_id in your workspace settings.\n", + "# Getting started\n\n" + "Welcome to Dograh.\n", encoding="utf-8", ) - (docs_root / "configurations" / "transcriber.mdx").write_text( + + (docs_root / "voice-agent").mkdir() + (docs_root / "voice-agent" / "introduction.mdx").write_text( "---\n" - 'title: "Transcriber"\n' + 'title: "Voice Agent Builder"\n' + 'description: "Build conversational workflows."\n' "---\n\n" - "# Speech-to-text\n\nDeepgram is the default transcriber.\n", + "# Voice Agent Builder\n\n" + "Build workflows with nodes and tools.\n", + encoding="utf-8", + ) + + (docs_root / "voice-agent" / "tools").mkdir() + (docs_root / "voice-agent" / "tools" / "mcp-tool.mdx").write_text( + "---\n" + 'title: "MCP Tool"\n' + 'description: "Connect external MCP servers."\n' + 'llm_hint: "Use for MCP server setup, remote tools, or model context protocol questions."\n' + "aliases:\n" + ' - "model context protocol"\n' + "---\n\n" + "# MCP Tool\n\n" + "Connect an external MCP server to your voice agent.\n\n" + "## Authentication\n\n" + "Provide the MCP endpoint URL and headers.\n", encoding="utf-8", ) (docs_root / "deployment").mkdir() - (docs_root / "deployment" / "turn-server.mdx").write_text( + (docs_root / "deployment" / "docker.mdx").write_text( "---\n" - 'title: "TURN server setup"\n' + 'title: "Docker"\n' + 'description: "Deploy Dograh with Docker."\n' + 'llm_hint: "Use for Docker deployment, local setup, remote setup, TURN server, coturn, or WebRTC connectivity questions."\n' + "aliases:\n" + ' - "coturn"\n' + ' - "turn server"\n' "---\n\n" - "# TURN server\n\n" - "WebRTC requires a TURN server for NAT traversal. Coturn is the " - "recommended choice for self-hosted deployments.\n", + "# Docker\n\n" + "Run Dograh with Docker.\n\n" + "## Troubleshooting WebRTC Connectivity\n\n" + "If audio fails or ICE fails, configure a TURN server. Coturn is the recommended choice.\n", encoding="utf-8", ) - # A non-doc file that must be ignored by the corpus loader. - (docs_root / "docs.json").write_text('{"name":"Dograh"}', encoding="utf-8") + # Hidden/orphaned docs page: present on disk but not in docs.json, so it + # must not be indexed by the MCP tools. + (docs_root / "internal-only.mdx").write_text( + "---\n" + 'title: "Internal TURN Notes"\n' + "---\n\n" + "# Internal TURN Notes\n\n" + "This page mentions zyxinternalturntoken but is not user-facing.\n", + encoding="utf-8", + ) - # Reset the LRU cache and pin the resolver to our tmp tree. - docs_search_module._docs_corpus.cache_clear() + (docs_root / "AGENTS.md").write_text("# Internal instructions\n", encoding="utf-8") + + (docs_root / "docs.json").write_text( + """{ + "navigation": { + "tabs": [ + { + "tab": "Guides", + "groups": [ + { + "group": "Getting started", + "pages": [ + "getting-started/index" + ] + }, + { + "group": "Voice Agent Builder", + "pages": [ + "voice-agent/introduction", + { + "group": "Tools", + "pages": [ + "voice-agent/tools/mcp-tool" + ] + } + ] + } + ] + }, + { + "tab": "Developer", + "groups": [ + { + "group": "Deployment", + "pages": [ + "deployment/docker" + ] + } + ] + } + ] + } +} +""", + encoding="utf-8", + ) + + _clear_docs_caches() with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(docs_root)}): yield docs_root - docs_search_module._docs_corpus.cache_clear() + _clear_docs_caches() @pytest.fixture def authed_user(): - """Stub ``authenticate_mcp_request`` so tests skip the API-key path.""" - class _FakeUser: selected_organization_id = 1 id = 42 @@ -93,18 +164,8 @@ def authed_user(): yield _FakeUser() -# ─── Pure helpers ──────────────────────────────────────────────────────── - - -def test_tokenize_query_strips_short_and_punct_terms(): - """Punctuation and 1-char tokens must not bleed into the scorer. - - A trailing `?` or stray `a` would otherwise match nearly every page - and flatten the relevance ranking. - """ - assert _tokenize_query("How do I configure a TURN server?") == [ - "how", - "do", +def test_tokenize_query_dedupes_and_drops_stopwords(): + assert _tokenize_query("How do I configure a TURN server TURN?") == [ "configure", "turn", "server", @@ -121,155 +182,92 @@ def test_strip_frontmatter_removes_yaml_block(): assert _strip_frontmatter(body).startswith("# Heading") -def test_strip_frontmatter_passes_through_when_missing(): - body = "# Just a heading\nbody text\n" - assert _strip_frontmatter(body) == body - - def test_extract_page_title_prefers_frontmatter(): body = '---\ntitle: "Front Title"\n---\n\n# Heading Title\n' assert _extract_page_title(body, fallback="x.mdx") == "Front Title" def test_extract_page_title_falls_back_to_first_heading(): - """When frontmatter is missing the first ATX heading is the next best - signal — better than just returning the filename, which often is - a slug not a human-readable title.""" body = "# Heading Title\nbody\n" assert _extract_page_title(body, fallback="x.mdx") == "Heading Title" -def test_extract_page_title_falls_back_to_filename_when_nothing_matches(): - body = "plain prose with no heading or frontmatter" - assert _extract_page_title(body, fallback="x.mdx") == "x.mdx" - - -def test_docs_url_for_strips_extension_and_index(): - assert ( - _docs_url_for("configurations/voice.mdx") - == "https://docs.dograh.com/configurations/voice" - ) - assert ( - _docs_url_for("getting-started/index.mdx") - == "https://docs.dograh.com/getting-started" - ) - - -def test_score_page_weights_title_above_body(): - """Title hits must outweigh body hits — otherwise a long page that - incidentally mentions the term many times outranks the page whose - purpose IS the term.""" - title_only = _score_page( - rel_path="other.mdx", title="TURN server", body="unrelated text", terms=["turn"] - ) - body_only = _score_page( - rel_path="other.mdx", - title="Unrelated", - body="turn turn turn turn turn", - terms=["turn"], - ) - assert title_only > body_only - - -def test_score_page_returns_zero_when_no_terms_match(): - assert ( - _score_page( - rel_path="x.mdx", title="X", body="hello world", terms=["nonexistent"] - ) - == 0 +def test_score_page_uses_llm_hint_and_aliases(): + page = docs_search_module.DocPage( + path="deployment/docker", + file_path="deployment/docker.mdx", + title="Docker", + description="Deploy Dograh with Docker.", + llm_hint="Use for TURN server and coturn setup.", + aliases=("coturn",), + breadcrumb=("Developer", "Deployment"), + content="Docker deployment.", + sections=( + docs_search_module.DocSection( + title="Troubleshooting WebRTC Connectivity", + slug="troubleshooting-webrtc-connectivity", + level=2, + content="Configure a TURN server with coturn.", + ), + ), + order=0, ) + score, section = _score_page(page, ["coturn"]) + assert score > 0 + assert section is not None + assert section.slug == "troubleshooting-webrtc-connectivity" def test_resolve_docs_root_honors_env_override(tmp_path: Path): docs = tmp_path / "custom_docs" docs.mkdir() + (docs / "docs.json").write_text("{}", encoding="utf-8") with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(docs)}): assert _resolve_docs_root() == docs.resolve() -def test_resolve_docs_root_ignores_nonexistent_env_value(tmp_path: Path): - """A bogus env value must not crash the tool — fall back to discovery - (the real ``docs/`` in the repo) instead.""" - with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(tmp_path / "nope")}): - # Walk-up discovery should land somewhere (the repo's actual docs) - # but we don't assert the exact path because it depends on where - # the tests are run; we just assert no crash and either None or a dir. - resolved = _resolve_docs_root() - assert resolved is None or resolved.is_dir() - - -# ─── End-to-end tool behaviour ─────────────────────────────────────────── - - @pytest.mark.asyncio -async def test_search_docs_ranks_turn_setup_first_for_turn_query( +async def test_search_docs_ranks_turn_doc_and_uses_route_path( fake_docs_root, authed_user ): - """The page whose title and body are both about TURN must outrank - incidental mentions of related words on other pages.""" - results = await search_docs("How do I set up a TURN server?") - assert results, "expected at least one result" - assert results[0]["path"] == "deployment/turn-server.mdx" - assert results[0]["url"] == "https://docs.dograh.com/deployment/turn-server" - assert "TURN server" in results[0]["title"] - assert "TURN" in results[0]["snippet"] or "turn" in results[0]["snippet"].lower() + results = await search_docs("How do I configure coturn for WebRTC?") + assert results + assert results[0]["path"] == "deployment/docker" + assert results[0]["section_slug"] == "troubleshooting-webrtc-connectivity" + assert "TURN server" in results[0]["llm_hint"] + assert "snippet" not in results[0] + assert "score" not in results[0] + assert "url" not in results[0] @pytest.mark.asyncio -async def test_search_docs_excludes_non_doc_files(fake_docs_root, authed_user): - """``docs.json`` must not appear — the corpus loader filters to - .mdx/.md only.""" - results = await search_docs("Dograh") - paths = [r["path"] for r in results] - assert "docs.json" not in paths - - -@pytest.mark.asyncio -async def test_search_docs_returns_empty_when_no_match(fake_docs_root, authed_user): - results = await search_docs("xyzzy unrelated zzz") +async def test_search_docs_indexes_only_docs_json_pages(fake_docs_root, authed_user): + results = await search_docs("zyxinternalturntoken") assert results == [] @pytest.mark.asyncio async def test_search_docs_respects_limit(fake_docs_root, authed_user): - """``limit=1`` must collapse the result list even if multiple pages - match.""" - results = await search_docs("Dograh", limit=1) + results = await search_docs("dograh", limit=1) assert len(results) == 1 @pytest.mark.asyncio -async def test_search_docs_clamps_limit_to_hard_cap(fake_docs_root, authed_user): - """A pathological large limit must be clamped to - ``DOCS_SEARCH_MAX_LIMIT`` (=25) so the payload stays bounded.""" - # Drop in extra docs so there's headroom to verify the clamp. - for i in range(30): - (fake_docs_root / f"extra-{i}.mdx").write_text( - f"# Page {i}\nThis Dograh page covers configurations topic {i}.\n", - encoding="utf-8", - ) - docs_search_module._docs_corpus.cache_clear() - results = await search_docs("Dograh", limit=999) - assert len(results) <= 25 +async def test_search_docs_returns_empty_when_no_match(fake_docs_root, authed_user): + assert await search_docs("xyzzy unrelated zzz") == [] @pytest.mark.asyncio async def test_search_docs_returns_empty_when_no_corpus( tmp_path, authed_user, monkeypatch ): - """If the docs directory doesn't exist on disk, the tool must - degrade to an empty list rather than raising — Docker images and - dev checkouts can disagree on layout.""" nonexistent = tmp_path / "no-docs-here" monkeypatch.setenv("DOGRAH_DOCS_PATH", str(nonexistent)) - # Also block the walk-up fallback by pointing the resolver at a - # tmp path with no `docs/` ancestor. - docs_search_module._docs_corpus.cache_clear() + _clear_docs_caches() with patch( "api.mcp_server.tools.docs_search._resolve_docs_root", return_value=None ): - results = await search_docs("anything") - assert results == [] + assert await search_docs("anything") == [] @pytest.mark.asyncio @@ -279,16 +277,83 @@ async def test_search_docs_rejects_empty_query(fake_docs_root, authed_user): @pytest.mark.asyncio -async def test_search_docs_rejects_query_with_no_real_terms( +async def test_search_docs_rejects_query_with_only_stopwords( fake_docs_root, authed_user ): - """A query like `"???"` tokenizes to nothing — surface an actionable - error rather than silently returning every page.""" - with pytest.raises(ValueError, match="2\\+ alphanumeric"): - await search_docs("?? // !!") + with pytest.raises(ValueError, match="non-stopword"): + await search_docs("how do I") @pytest.mark.asyncio async def test_search_docs_rejects_zero_limit(fake_docs_root, authed_user): with pytest.raises(ValueError, match="at least 1"): await search_docs("Dograh", limit=0) + + +@pytest.mark.asyncio +async def test_list_docs_returns_top_level_sections(fake_docs_root, authed_user): + results = await list_docs() + assert results[0]["kind"] == "section" + assert results[0]["path"] == "guides/getting-started" + assert results[1]["path"] == "guides/voice-agent-builder" + + +@pytest.mark.asyncio +async def test_list_docs_depth_expands_children(fake_docs_root, authed_user): + results = await list_docs("guides/voice-agent-builder", depth=2) + paths = [item["path"] for item in results] + assert "voice-agent/introduction" in paths + assert "guides/voice-agent-builder/tools" in paths + assert "voice-agent/tools/mcp-tool" in paths + + +@pytest.mark.asyncio +async def test_list_docs_rejects_unknown_section(fake_docs_root, authed_user): + with pytest.raises(HTTPException, match="Unknown docs section"): + await list_docs("nope") + + +@pytest.mark.asyncio +async def test_read_doc_returns_full_page_and_sections(fake_docs_root, authed_user): + result = await read_doc("deployment/docker") + assert result["path"] == "deployment/docker" + assert result["title"] == "Docker" + assert "url" not in result + section_slugs = [section["slug"] for section in result["sections"]] + assert "docker" in section_slugs + assert "troubleshooting-webrtc-connectivity" in section_slugs + assert "Coturn" in result["content"] or "coturn" in result["content"].lower() + + +@pytest.mark.asyncio +async def test_read_doc_can_target_section(fake_docs_root, authed_user): + result = await read_doc( + "deployment/docker", + section="troubleshooting-webrtc-connectivity", + ) + assert result["section_slug"] == "troubleshooting-webrtc-connectivity" + assert "ICE fails" in result["content"] or "TURN server" in result["content"] + assert "Run Dograh with Docker." not in result["content"] + + +@pytest.mark.asyncio +async def test_read_doc_rejects_unknown_page(fake_docs_root, authed_user): + with pytest.raises(HTTPException, match="Unknown docs page"): + await read_doc("missing/page") + + +@pytest.mark.asyncio +async def test_read_doc_rejects_unknown_section(fake_docs_root, authed_user): + with pytest.raises(HTTPException, match="Unknown section"): + await read_doc("deployment/docker", section="missing-section") + + +def test_docs_index_uses_docs_json_navigation(fake_docs_root): + index = _docs_index() + assert "internal-only" not in index.pages_by_path + assert "guides/voice-agent-builder/tools" in index.sections_by_path + assert index.pages_by_path["voice-agent/tools/mcp-tool"].breadcrumb == ( + "Guides", + "Voice Agent Builder", + "Tools", + )