feat: improve docs search

2026-07-25 12:01:04 +02:00 · 2026-05-20 18:18:05 +05:30 · 2026-05-20 18:18:05 +05:30 · 5c638070e0
commit 5c638070e0
parent 4618af20b8
5 changed files with 876 additions and 404 deletions
--- a/api/mcp_server/instructions.py
+++ b/api/mcp_server/instructions.py
@ -16,6 +16,11 @@ You build and edit Dograh voice-AI workflows by emitting TypeScript that uses th

 ## Call order

+### Reading documentation
+1. `search_docs(query)` — use first for keyword or acronym lookup when the user is asking how Dograh works or how to configure something.
+2. `read_doc(path)` — fetch the full page once a result looks relevant. Prefer this over reasoning from search summaries alone.
+3. `list_docs(path=None, depth=1)` — use when the user wants to browse a topic area or when search terms are too vague. Returned section paths feed back into `list_docs`; returned page paths feed into `read_doc`.
+
 ### Editing an existing workflow
 1. `list_workflows` — locate the target workflow.
 2. `get_workflow_code(workflow_id)` — fetch the current source.
--- a/api/mcp_server/server.py
+++ b/api/mcp_server/server.py
@ -1,4 +1,5 @@
 from fastmcp import FastMCP
+from mcp.types import ToolAnnotations

 from api.mcp_server.instructions import DOGRAH_MCP_INSTRUCTIONS
 from api.mcp_server.tools.catalog import (
@ -8,7 +9,7 @@ from api.mcp_server.tools.catalog import (
    list_tools,
 )
 from api.mcp_server.tools.create_workflow import create_workflow
-from api.mcp_server.tools.docs_search import search_docs
+from api.mcp_server.tools.docs_search import list_docs, read_doc, search_docs
 from api.mcp_server.tools.get_workflow_code import get_workflow_code
 from api.mcp_server.tools.node_types import get_node_type, list_node_types
 from api.mcp_server.tools.save_workflow import save_workflow
@ -28,6 +29,15 @@ for _tool in (
    list_tools,
    list_workflows,
    save_workflow,
-    search_docs,
 ):
    mcp.tool(_tool)
+
+_DOCS_TOOL_ANNOTATIONS = ToolAnnotations(
+    readOnlyHint=True,
+    idempotentHint=True,
+    destructiveHint=False,
+    openWorldHint=False,
+)
+
+for _tool in (list_docs, read_doc, search_docs):
+    mcp.tool(_tool, annotations=_DOCS_TOOL_ANNOTATIONS)
--- a/api/mcp_server/tools/docs_search.py
+++ b/api/mcp_server/tools/docs_search.py
@ -1,312 +1,704 @@
-"""`search_docs` MCP tool — keyword search over the Mintlify docs tree.
+"""MCP docs discovery tools over the Mintlify docs tree.

-The docs are shipped into the API image (`COPY ./docs ./docs` in
-`api/Dockerfile`), so this tool works for both source/dev runs and
-Docker deployments. For source/dev runs we walk up from this file to
-locate the `docs/` directory; for Docker we land on `/app/docs`. An
-explicit `DOGRAH_DOCS_PATH` env var overrides discovery.
+The docs surface is intentionally split into three steps:

-The implementation is intentionally dependency-free: it does in-memory
-keyword scoring rather than building a vector index. The docs corpus is
-small (~100 .mdx files, ~140k LoC), so a per-call scan is well under
-50 ms and avoids needing an embedding backend, vector store, or
-background indexer for a tool that's called interactively from MCP.
+- ``list_docs`` for lightweight navigation over the published hierarchy
+- ``search_docs`` for keyword lookup across the visible docs catalog
+- ``read_doc`` for the full content of one chosen page (or one section)
+
+The runtime index is derived from ``docs/docs.json`` plus the referenced
+``.mdx``/``.md`` files. That keeps navigation, ordering, and visibility in
+sync with the published docs rather than indexing every file under ``docs/``.
 """

 from __future__ import annotations

+import json
 import os
 import re
+from collections import Counter
+from dataclasses import dataclass, replace
 from functools import lru_cache
 from pathlib import Path
+from typing import Any
+
+import yaml
+from fastapi import HTTPException

 from api.mcp_server.auth import authenticate_mcp_request
 from api.mcp_server.tracing import traced_tool

-# Public site for the rendered docs. Used to build a clickable URL per
-# result; agents can hand the URL back to the user even if the local
-# file isn't reachable.
-DOCS_SITE_BASE_URL = "https://docs.dograh.com"
-
-# Hard cap regardless of caller-supplied limit. Keeps the MCP response
-# payload bounded; Mintlify search APIs use a similar 10-25 ceiling.
 DOCS_SEARCH_MAX_LIMIT = 25
+DOCS_LIST_MAX_DEPTH = 3
+_ROOT_SECTION_PATH = "__root__"

-# Heading-detection regex. Matches ATX headings (`# `, `## `, etc.) but
-# not in-line `#` characters.
+_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
+_FRONTMATTER_RE = re.compile(r"\A---\s*\n(.*?)\n---\s*\n?", re.DOTALL)
 _HEADING_RE = re.compile(r"^(#{1,6})\s+(.*?)\s*$", re.MULTILINE)
+_STOPWORDS = {
+    "a",
+    "an",
+    "and",
+    "are",
+    "at",
+    "be",
+    "by",
+    "can",
+    "do",
+    "for",
+    "from",
+    "how",
+    "i",
+    "if",
+    "in",
+    "is",
+    "it",
+    "me",
+    "my",
+    "of",
+    "on",
+    "or",
+    "the",
+    "to",
+    "what",
+    "when",
+    "where",
+    "with",
+    "you",
+    "your",
+}
+
+
+@dataclass(frozen=True)
+class DocSection:
+    title: str
+    slug: str
+    level: int
+    content: str
+
+
+@dataclass(frozen=True)
+class DocPage:
+    path: str
+    file_path: str
+    title: str
+    description: str
+    llm_hint: str
+    aliases: tuple[str, ...]
+    breadcrumb: tuple[str, ...]
+    content: str
+    sections: tuple[DocSection, ...]
+    order: int
+
+    def breadcrumb_text(self) -> str:
+        return " > ".join(self.breadcrumb)
+
+    def routing_hint(self) -> str:
+        return self.llm_hint or self.description
+
+    def to_catalog_dict(self, section: DocSection | None = None) -> dict:
+        data = {
+            "kind": "page",
+            "path": self.path,
+            "title": self.title,
+            "breadcrumb": self.breadcrumb_text(),
+            "llm_hint": self.routing_hint(),
+        }
+        if section is not None:
+            data["section_title"] = section.title
+            data["section_slug"] = section.slug
+        return _compact_dict(data)
+
+    def to_read_dict(self, section: DocSection | None = None) -> dict:
+        active_section = section
+        content = self.content
+        if active_section is not None:
+            content = active_section.content
+
+        return _compact_dict(
+            {
+                "path": self.path,
+                "title": self.title,
+                "breadcrumb": self.breadcrumb_text(),
+                "llm_hint": self.routing_hint(),
+                "section_title": active_section.title if active_section else None,
+                "section_slug": active_section.slug if active_section else None,
+                "content": content,
+                "sections": [
+                    {"title": sec.title, "slug": sec.slug}
+                    for sec in self.sections
+                    if sec.title and sec.slug
+                ],
+            }
+        )
+
+
+@dataclass(frozen=True)
+class NavSection:
+    path: str
+    title: str
+    breadcrumb: tuple[str, ...]
+    children: tuple[tuple[str, str], ...]
+    descendant_page_count: int = 0
+
+    def breadcrumb_text(self) -> str:
+        return " > ".join(self.breadcrumb)
+
+    def to_mcp_dict(self) -> dict:
+        hint = None
+        if self.descendant_page_count:
+            hint = f"Browse {self.descendant_page_count} docs in this section."
+        return _compact_dict(
+            {
+                "kind": "section",
+                "path": self.path,
+                "title": self.title,
+                "breadcrumb": self.breadcrumb_text(),
+                "llm_hint": hint,
+                "has_children": bool(self.children),
+                "child_count": len(self.children),
+                "page_count": self.descendant_page_count,
+            }
+        )
+
+
+@dataclass(frozen=True)
+class DocsIndex:
+    pages_by_path: dict[str, DocPage]
+    sections_by_path: dict[str, NavSection]
+
+
+def _compact_dict(data: dict[str, Any]) -> dict[str, Any]:
+    return {
+        key: value for key, value in data.items() if value not in (None, "", [], (), {})
+    }
+
+
+def _slugify(value: str) -> str:
+    slug = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")
+    return slug or "section"
+
+
+def _coerce_docs_root(candidate: Path) -> Path | None:
+    candidate = candidate.expanduser().resolve()
+    if (candidate / "docs.json").is_file():
+        return candidate
+    nested = candidate / "docs"
+    if (nested / "docs.json").is_file():
+        return nested
+    return None


 def _resolve_docs_root() -> Path | None:
-    """Return the path to the on-disk docs tree, or None if not found.
-
-    Resolution order:
-    1. ``DOGRAH_DOCS_PATH`` env var (absolute path).
-    2. ``/app/docs`` — the location the API Dockerfile copies docs to.
-    3. Walk upward from this file looking for a sibling ``docs/`` dir
-       (covers source-checkout / dev runs).
-    """
+    """Return the docs root (the directory containing ``docs.json``), or None."""
    override = os.environ.get("DOGRAH_DOCS_PATH")
    if override:
-        candidate = Path(override).expanduser().resolve()
-        if candidate.is_dir():
-            return candidate
+        resolved = _coerce_docs_root(Path(override))
+        if resolved is not None:
+            return resolved

-    docker_default = Path("/app/docs")
-    if docker_default.is_dir():
+    docker_default = _coerce_docs_root(Path("/app/docs"))
+    if docker_default is not None:
        return docker_default

-    # Walk up from .../api/mcp_server/tools/docs_search.py looking for docs/.
    for parent in Path(__file__).resolve().parents:
-        candidate = parent / "docs"
-        if candidate.is_dir():
-            return candidate
+        resolved = _coerce_docs_root(parent / "docs")
+        if resolved is not None:
+            return resolved

    return None


-@lru_cache(maxsize=1)
-def _docs_corpus() -> tuple[tuple[str, str], ...]:
-    """Load the docs corpus once per process.
-
-    Returns a tuple of ``(relative_path, file_contents)`` pairs. The
-    docs tree is small and read-mostly at runtime, so caching the full
-    text in memory is cheaper than re-reading on every search.
-    Cache miss is intentional when ``DOGRAH_DOCS_PATH`` flips at
-    startup — for live edits, restart the process.
-    """
-    root = _resolve_docs_root()
-    if root is None:
-        return ()
-
-    pairs: list[tuple[str, str]] = []
-    for path in sorted(root.rglob("*")):
-        if not path.is_file():
-            continue
-        if path.suffix.lower() not in {".mdx", ".md"}:
-            continue
-        try:
-            contents = path.read_text(encoding="utf-8")
-        except (OSError, UnicodeDecodeError):
-            # Skip unreadable files rather than crashing the whole tool.
-            continue
-        rel = path.relative_to(root).as_posix()
-        pairs.append((rel, contents))
-    return tuple(pairs)
-
-
-def _tokenize_query(query: str) -> list[str]:
-    """Split a user query into lowercased keyword terms.
-
-    Empty strings and 1-char filler terms are dropped — they would
-    match almost every file and drown out the real signal.
-    """
-    terms = re.findall(r"[A-Za-z0-9_]+", query.lower())
-    return [term for term in terms if len(term) >= 2]
-
-
-def _extract_page_title(contents: str, fallback: str) -> str:
-    """Pull a human-readable title for a docs page.
-
-    Mintlify pages start with a YAML frontmatter block whose ``title``
-    is the most authoritative title; fall back to the first ATX heading
-    if frontmatter is missing or malformed; fall back to the filename
-    if no heading exists.
-    """
-    if contents.startswith("---"):
-        end = contents.find("---", 3)
-        if end != -1:
-            frontmatter = contents[3:end]
-            for line in frontmatter.splitlines():
-                line = line.strip()
-                if line.lower().startswith("title:"):
-                    value = line.split(":", 1)[1].strip()
-                    # Strip surrounding quotes if Mintlify wrote them.
-                    if (
-                        len(value) >= 2
-                        and value[0] == value[-1]
-                        and value[0] in ('"', "'")
-                    ):
-                        value = value[1:-1]
-                    if value:
-                        return value
-
-    match = _HEADING_RE.search(contents)
-    if match:
-        return match.group(2).strip()
-
-    return fallback
+def _split_frontmatter(contents: str) -> tuple[dict[str, Any], str]:
+    match = _FRONTMATTER_RE.match(contents)
+    if not match:
+        return {}, contents
+    try:
+        frontmatter = yaml.safe_load(match.group(1)) or {}
+    except yaml.YAMLError:
+        return {}, contents
+    if not isinstance(frontmatter, dict):
+        frontmatter = {}
+    return frontmatter, contents[match.end() :].lstrip("\n")


 def _strip_frontmatter(contents: str) -> str:
    """Drop the YAML frontmatter block from a docs page body."""
-    if not contents.startswith("---"):
-        return contents
-    end = contents.find("---", 3)
-    if end == -1:
-        return contents
-    return contents[end + 3 :].lstrip("\n")
+    return _split_frontmatter(contents)[1]


-def _build_snippet(body: str, terms: list[str], snippet_radius: int = 120) -> str:
-    """Return a ~240-char window around the first term hit in ``body``.
+def _clean_heading_text(raw: str) -> str:
+    text = re.sub(r"\s*\{#.*\}\s*$", "", raw.strip())
+    return " ".join(text.split())

-    The window is centered on the earliest match (whichever term comes
-    first wins) so the snippet shows context for the strongest signal,
-    not the lexicographically-first term. Leading/trailing newlines are
-    collapsed so the snippet renders cleanly through MCP's text payload.
-    """
-    body_lower = body.lower()
-    earliest = -1
-    for term in terms:
-        idx = body_lower.find(term)
-        if idx != -1 and (earliest == -1 or idx < earliest):
-            earliest = idx

-    if earliest == -1:
-        # No hit in body — the match must have come from the title or
-        # path, so just return the first line of body as orientation.
-        first_line = next(
-            (line.strip() for line in body.splitlines() if line.strip()),
-            "",
+def _extract_page_title(contents: str, fallback: str) -> str:
+    """Return the frontmatter ``title``, else the first heading, else ``fallback``."""
+    frontmatter, body = _split_frontmatter(contents)
+    title = frontmatter.get("title")
+    if isinstance(title, str) and title.strip():
+        return title.strip()
+
+    match = _HEADING_RE.search(body)
+    if match:
+        return _clean_heading_text(match.group(2))
+
+    return fallback
+
+
+def _normalize_text(value: Any) -> str:
+    if isinstance(value, str):
+        return " ".join(value.strip().split())
+    return ""
+
+
+def _normalize_aliases(value: Any) -> tuple[str, ...]:
+    if isinstance(value, str):
+        aliases = [value]
+    elif isinstance(value, list):
+        aliases = [item for item in value if isinstance(item, str)]
+    else:
+        aliases = []
+    return tuple(alias.strip() for alias in aliases if alias.strip())
+
+
+def _extract_sections(body: str) -> tuple[DocSection, ...]:
+    matches = list(_HEADING_RE.finditer(body))
+    stripped_body = body.strip()
+    if not matches:
+        if not stripped_body:
+            return ()
+        return (
+            DocSection(
+                title="Overview",
+                slug="overview",
+                level=1,
+                content=stripped_body,
+            ),
        )
-        return first_line[: snippet_radius * 2]

-    start = max(0, earliest - snippet_radius)
-    end = min(len(body), earliest + snippet_radius)
-    snippet = body[start:end]
-    # Collapse all whitespace runs (incl. internal newlines) for a
-    # single-line snippet — MCP renders text payloads inline.
-    snippet = " ".join(snippet.split())
-    prefix = "…" if start > 0 else ""
-    suffix = "…" if end < len(body) else ""
-    return f"{prefix}{snippet}{suffix}"
+    sections: list[DocSection] = []
+    preamble = body[: matches[0].start()].strip()
+    if preamble:
+        sections.append(
+            DocSection(
+                title="Overview",
+                slug="overview",
+                level=1,
+                content=preamble,
+            )
+        )
+
+    for index, match in enumerate(matches):
+        start = match.start()
+        end = matches[index + 1].start() if index + 1 < len(matches) else len(body)
+        title = _clean_heading_text(match.group(2))
+        sections.append(
+            DocSection(
+                title=title or "Section",
+                slug=_slugify(title or "section"),
+                level=len(match.group(1)),
+                content=body[start:end].strip(),
+            )
+        )
+    return tuple(sections)


-def _score_page(
-    rel_path: str,
-    title: str,
-    body: str,
-    terms: list[str],
-) -> int:
-    """Weighted keyword score for a single docs page.
+def _tokenize_text(text: str) -> list[str]:
+    return [
+        token
+        for token in _TOKEN_RE.findall(text.lower())
+        if len(token) >= 2 and token not in _STOPWORDS
+    ]

-    Title/path matches outweigh body matches because they encode the
-    page's purpose, not just incidental mentions. Each query term
-    contributes independently — a page matching all terms ranks above
-    one matching a single term many times.
-    """
-    if not terms:
-        return 0
-    score = 0
-    path_lower = rel_path.lower()
-    title_lower = title.lower()
-    body_lower = body.lower()
-    for term in terms:
-        path_hits = path_lower.count(term)
-        title_hits = title_lower.count(term)
-        body_hits = body_lower.count(term)
-        if path_hits == 0 and title_hits == 0 and body_hits == 0:
-            # Penalize pages that miss any query term — they probably
-            # aren't what the caller wants.
+
+def _tokenize_query(query: str) -> list[str]:
+    """Return unique lowercased query terms, minus stopwords and 1-char tokens."""
+    seen: set[str] = set()
+    terms: list[str] = []
+    for token in _TOKEN_RE.findall(query.lower()):
+        if len(token) < 2 or token in _STOPWORDS or token in seen:
            continue
-        # Diminishing returns past a few hits per term: 1 dominant page
-        # shouldn't outweigh a page that hits every term. The cap is
-        # deliberately set so ``title_weight (5)`` strictly exceeds
-        # ``body_cap (4) × body_weight (1)`` — a page whose TITLE is the
-        # term must outrank a page that merely mentions it repeatedly.
-        body_hits = min(body_hits, 4)
-        score += path_hits * 8 + title_hits * 5 + body_hits
+        seen.add(token)
+        terms.append(token)
+    return terms
+
+
+def _resolve_doc_file(root: Path, route_path: str) -> Path | None:
+    candidates = (
+        root / f"{route_path}.mdx",
+        root / f"{route_path}.md",
+        root / route_path / "index.mdx",
+        root / route_path / "index.md",
+    )
+    for candidate in candidates:
+        if candidate.is_file():
+            return candidate
+    return None
+
+
+def _build_doc_page(
+    root: Path,
+    route_path: str,
+    *,
+    breadcrumb: tuple[str, ...],
+    order: int,
+) -> DocPage | None:
+    file_path = _resolve_doc_file(root, route_path)
+    if file_path is None:
+        return None
+    try:
+        contents = file_path.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError):
+        return None
+
+    frontmatter, body = _split_frontmatter(contents)
+    fallback = route_path.rsplit("/", 1)[-1].replace("-", " ").title()
+    title = _extract_page_title(contents, fallback=fallback)
+    description = _normalize_text(frontmatter.get("description"))
+    llm_hint = _normalize_text(frontmatter.get("llm_hint"))
+    aliases = _normalize_aliases(frontmatter.get("aliases"))
+    content = body.strip()
+
+    return DocPage(
+        path=route_path,
+        file_path=file_path.relative_to(root).as_posix(),
+        title=title,
+        description=description,
+        llm_hint=llm_hint,
+        aliases=aliases,
+        breadcrumb=breadcrumb,
+        content=content,
+        sections=_extract_sections(content),
+        order=order,
+    )
+
+
+def _score_counter(counter: Counter[str], term: str, *, weight: int, cap: int) -> int:
+    return min(counter.get(term, 0), cap) * weight
+
+
+def _normalized_phrase(text: str) -> str:
+    return " ".join(_tokenize_text(text))
+
+
+def _score_section(section: DocSection, terms: list[str]) -> int:
+    title_counts = Counter(_tokenize_text(section.title))
+    body_counts = Counter(_tokenize_text(section.content))
+    score = 0
+    matched_terms = 0
+    for term in terms:
+        term_score = _score_counter(
+            title_counts, term, weight=7, cap=2
+        ) + _score_counter(body_counts, term, weight=1, cap=4)
+        if term_score:
+            matched_terms += 1
+            score += term_score
+    score += matched_terms * 4
+
+    phrase = " ".join(terms)
+    if phrase and phrase in _normalized_phrase(section.content):
+        score += 6
    return score


-def _docs_url_for(rel_path: str) -> str:
-    """Build the public docs URL for a relative on-disk path."""
-    # Strip the extension and `index` so `getting-started/index.mdx`
-    # maps to `/getting-started`, matching Mintlify's routing.
-    no_ext = re.sub(r"\.(mdx|md)$", "", rel_path, flags=re.IGNORECASE)
-    if no_ext.endswith("/index"):
-        no_ext = no_ext[: -len("/index")]
-    return f"{DOCS_SITE_BASE_URL}/{no_ext}".rstrip("/")
+def _score_page(page: DocPage, terms: list[str]) -> tuple[int, DocSection | None]:
+    if not terms:
+        return 0, None
+
+    path_counts = Counter(_tokenize_text(page.path))
+    title_counts = Counter(_tokenize_text(page.title))
+    breadcrumb_counts = Counter(_tokenize_text(" ".join(page.breadcrumb)))
+    hint_counts = Counter(_tokenize_text(page.routing_hint()))
+    alias_counts = Counter(_tokenize_text(" ".join(page.aliases)))
+
+    score = 0
+    matched_terms = 0
+    for term in terms:
+        term_score = (
+            _score_counter(path_counts, term, weight=6, cap=3)
+            + _score_counter(title_counts, term, weight=10, cap=2)
+            + _score_counter(breadcrumb_counts, term, weight=4, cap=2)
+            + _score_counter(hint_counts, term, weight=7, cap=3)
+            + _score_counter(alias_counts, term, weight=7, cap=3)
+        )
+        if term_score:
+            matched_terms += 1
+            score += term_score
+
+    best_section = None
+    best_section_score = 0
+    for section in page.sections:
+        section_score = _score_section(section, terms)
+        if section_score > best_section_score:
+            best_section = section
+            best_section_score = section_score
+
+    if score == 0 and best_section_score == 0:
+        return 0, None
+
+    score += matched_terms * 8 + best_section_score
+
+    phrase = " ".join(terms)
+    if phrase:
+        if phrase in _normalized_phrase(page.title):
+            score += 12
+        elif phrase in _normalized_phrase(page.routing_hint()):
+            score += 8
+        elif phrase in _normalized_phrase(page.path):
+            score += 8
+        elif best_section is not None and phrase in _normalized_phrase(
+            best_section.content
+        ):
+            score += 4
+
+    return score, best_section
+
+
+def _set_descendant_counts(
+    sections_by_path: dict[str, NavSection],
+    section_path: str,
+) -> int:
+    section = sections_by_path[section_path]
+    page_count = 0
+    for child_kind, child_path in section.children:
+        if child_kind == "page":
+            page_count += 1
+        else:
+            page_count += _set_descendant_counts(sections_by_path, child_path)
+    sections_by_path[section_path] = replace(section, descendant_page_count=page_count)
+    return page_count
+
+
+@lru_cache(maxsize=1)
+def _docs_index() -> DocsIndex:
+    root = _resolve_docs_root()
+    if root is None:
+        return DocsIndex(pages_by_path={}, sections_by_path={})
+
+    try:
+        docs_config = json.loads((root / "docs.json").read_text(encoding="utf-8"))
+    except (OSError, UnicodeDecodeError, json.JSONDecodeError):
+        return DocsIndex(pages_by_path={}, sections_by_path={})
+
+    pages_by_path: dict[str, DocPage] = {}
+    sections_by_path: dict[str, NavSection] = {}
+    page_order = 0
+
+    def ensure_unique_section_path(base_path: str) -> str:
+        if base_path not in sections_by_path:
+            return base_path
+        suffix = 2
+        while f"{base_path}-{suffix}" in sections_by_path:
+            suffix += 1
+        return f"{base_path}-{suffix}"
+
+    def walk_pages(
+        items: list[Any],
+        *,
+        section_path: str,
+        section_title: str,
+        ancestor_breadcrumb: tuple[str, ...],
+    ) -> None:
+        nonlocal page_order
+        children: list[tuple[str, str]] = []
+        page_breadcrumb = ancestor_breadcrumb + (section_title,)
+
+        for item in items:
+            if isinstance(item, str):
+                route_path = item.strip("/")
+                if not route_path:
+                    continue
+                if route_path not in pages_by_path:
+                    page = _build_doc_page(
+                        root,
+                        route_path,
+                        breadcrumb=page_breadcrumb,
+                        order=page_order,
+                    )
+                    if page is not None:
+                        pages_by_path[route_path] = page
+                        page_order += 1
+                if route_path in pages_by_path:
+                    children.append(("page", route_path))
+                continue
+
+            if not isinstance(item, dict):
+                continue
+            group_title = str(item.get("group", "")).strip()
+            nested_pages = item.get("pages")
+            if not group_title or not isinstance(nested_pages, list):
+                continue
+
+            child_path = ensure_unique_section_path(
+                f"{section_path}/{_slugify(group_title)}"
+            )
+            walk_pages(
+                nested_pages,
+                section_path=child_path,
+                section_title=group_title,
+                ancestor_breadcrumb=page_breadcrumb,
+            )
+            children.append(("section", child_path))
+
+        sections_by_path[section_path] = NavSection(
+            path=section_path,
+            title=section_title,
+            breadcrumb=ancestor_breadcrumb,
+            children=tuple(children),
+        )
+
+    root_children: list[tuple[str, str]] = []
+    tabs = docs_config.get("navigation", {}).get("tabs", [])
+    for tab in tabs:
+        if not isinstance(tab, dict):
+            continue
+        tab_title = str(tab.get("tab", "")).strip() or "Docs"
+        for group in tab.get("groups", []):
+            if not isinstance(group, dict):
+                continue
+            group_title = str(group.get("group", "")).strip()
+            group_pages = group.get("pages")
+            if not group_title or not isinstance(group_pages, list):
+                continue
+            top_level_path = ensure_unique_section_path(
+                f"{_slugify(tab_title)}/{_slugify(group_title)}"
+            )
+            walk_pages(
+                group_pages,
+                section_path=top_level_path,
+                section_title=group_title,
+                ancestor_breadcrumb=(tab_title,),
+            )
+            root_children.append(("section", top_level_path))
+
+    sections_by_path[_ROOT_SECTION_PATH] = NavSection(
+        path=_ROOT_SECTION_PATH,
+        title="Docs",
+        breadcrumb=(),
+        children=tuple(root_children),
+    )
+    _set_descendant_counts(sections_by_path, _ROOT_SECTION_PATH)
+
+    return DocsIndex(pages_by_path=pages_by_path, sections_by_path=sections_by_path)
+
+
+def _get_page_or_404(path: str) -> DocPage:
+    page = _docs_index().pages_by_path.get(path.strip("/"))
+    if page is None:
+        raise HTTPException(status_code=404, detail=f"Unknown docs page: {path!r}")
+    return page
+
+
+def _find_section(page: DocPage, section: str) -> DocSection | None:
+    target = section.strip().lower()
+    for candidate in page.sections:
+        if candidate.slug.lower() == target or candidate.title.lower() == target:
+            return candidate
+    return None
+
+
+def _expand_nav_entries(
+    index: DocsIndex,
+    section_path: str,
+    depth: int,
+) -> list[dict]:
+    section = index.sections_by_path[section_path]
+    results: list[dict] = []
+    for child_kind, child_path in section.children:
+        if child_kind == "section":
+            child_section = index.sections_by_path[child_path]
+            results.append(child_section.to_mcp_dict())
+            if depth > 1:
+                results.extend(_expand_nav_entries(index, child_path, depth - 1))
+        else:
+            results.append(index.pages_by_path[child_path].to_catalog_dict())
+    return results


@traced_tool
-async def search_docs(query: str, limit: int = 10) -> list[dict]:
-    """Search the Dograh documentation by keyword and return ranked pages.
+async def list_docs(path: str | None = None, depth: int = 1) -> list[dict]:
+    """Browse the Dograh docs hierarchy before reading a page in full.

-    Use this when the caller asks "how do I configure X" / "where are the docs for Y" /
-    "what does Dograh say about Z" — anything that should land on a docs page
-    rather than a workspace resource. For workspace data (agents, recordings,
-    credentials), use ``list_workflows`` / ``list_recordings`` / ``list_credentials``
-    instead.
-
-    Args:
-        query: Free-form keywords (e.g. "TURN server", "elevenlabs voice").
-            Tokenized on non-alphanumeric characters; terms shorter than
-            2 characters are dropped.
-        limit: Max pages to return. Capped at 25 regardless of input;
-            default 10 keeps the payload small enough to inline in MCP.
-
-    Returns:
-        Up to ``limit`` results, sorted by descending relevance score.
-        Each entry has:
-          * ``path`` — repo-relative path (e.g. ``configurations/voice.mdx``)
-          * ``url`` — public docs URL (https://docs.dograh.com/...)
-          * ``title`` — page title (from Mintlify frontmatter when present)
-          * ``score`` — opaque integer relevance score
-          * ``snippet`` — ~240-char excerpt around the first term hit
+    ``path`` addresses navigation sections exposed by this tool. Page paths
+    returned by ``search_docs`` and ``read_doc`` are the published docs routes
+    instead, for example ``voice-agent/tools/mcp-tool``.
+    """
+    await authenticate_mcp_request()
+
+    # Depth must lie in 1..DOCS_LIST_MAX_DEPTH; anything else is a caller error.
+    if depth < 1 or depth > DOCS_LIST_MAX_DEPTH:
+        raise ValueError(f"`depth` must be between 1 and {DOCS_LIST_MAX_DEPTH}.")
+
+    index = _docs_index()
+    # An index with no sections yields an empty listing instead of an error.
+    if not index.sections_by_path:
+        return []
+
+    # No path given: list from the root navigation section.
+    if path is None:
+        return _expand_nav_entries(index, _ROOT_SECTION_PATH, depth)
+
+    # Surrounding slashes are ignored. Section paths are tried first, then
+    # page paths, in which case only that page's catalog entry is returned.
+    normalized = path.strip("/")
+    if normalized in index.sections_by_path:
+        return _expand_nav_entries(index, normalized, depth)
+    if normalized in index.pages_by_path:
+        return [index.pages_by_path[normalized].to_catalog_dict()]
+
+    # Neither a known section nor a known page.
+    raise HTTPException(status_code=404, detail=f"Unknown docs section: {path!r}")
+
+
+@traced_tool
+async def read_doc(path: str, section: str | None = None) -> dict:
+    """Read one docs page after you have narrowed to a likely match."""
+    await authenticate_mcp_request()
+
+    if not (isinstance(path, str) and path.strip()):
+        raise ValueError("`path` must be a non-empty string.")
+
+    page = _get_page_or_404(path)
+    # Without a section filter the whole page is returned.
+    if section is None:
+        return page.to_read_dict(section=None)
+
+    # A requested section that cannot be found on the page is a 404.
+    matched = _find_section(page, section)
+    if matched is None:
+        raise HTTPException(
+            status_code=404,
+            detail=f"Unknown section {section!r} for docs page {path!r}",
+        )
+    return page.to_read_dict(section=matched)
+
+
+@traced_tool
+async def search_docs(query: str, limit: int = 5) -> list[dict]:
+    """Search the Dograh documentation and return a lean ranked shortlist.
+
+    Use this first for keyword or acronym lookup. Once the right page looks
+    likely, call ``read_doc(path)`` instead of reasoning from summaries alone.
    """
-    # Authentication is consistent with the rest of the MCP tools and
-    # routes through the same rate-limiting path, even though docs are
-    # not org-scoped data.
    await authenticate_mcp_request()

    if not isinstance(query, str) or not query.strip():
-        raise ValueError("query must be a non-empty string.")
-
-    try:
-        effective_limit = int(limit)
-    except (TypeError, ValueError) as exc:
-        raise ValueError("limit must be an integer.") from exc
-    if effective_limit < 1:
-        raise ValueError("limit must be at least 1.")
-    effective_limit = min(effective_limit, DOCS_SEARCH_MAX_LIMIT)
+        raise ValueError("`query` must be a non-empty string.")
+    if limit < 1:
+        raise ValueError("`limit` must be at least 1.")

    terms = _tokenize_query(query)
    if not terms:
-        # The caller passed something like punctuation-only or only
-        # single-char tokens — surface an actionable error rather than
-        # silently returning everything.
+        # Nothing survived tokenization, so there is nothing to score against.
        raise ValueError(
-            "query must contain at least one keyword of 2+ alphanumeric characters."
+            "`query` must contain at least one non-stopword alphanumeric term."
        )

-    corpus = _docs_corpus()
-    if not corpus:
-        # Tool is registered but docs aren't on disk — return empty
-        # rather than 500ing so the caller can degrade gracefully.
+    index = _docs_index()
+    # No pages in the index: return an empty list instead of raising.
+    if not index.pages_by_path:
        return []

-    scored: list[tuple[int, str, str, str]] = []
-    for rel_path, contents in corpus:
-        title = _extract_page_title(contents, fallback=rel_path)
-        body = _strip_frontmatter(contents)
-        score = _score_page(rel_path, title, body, terms)
+    # Oversized limits are clamped to DOCS_SEARCH_MAX_LIMIT, not rejected.
+    capped_limit = min(limit, DOCS_SEARCH_MAX_LIMIT)
+    ranked: list[tuple[int, int, DocPage, DocSection | None]] = []
+    for page in index.pages_by_path.values():
+        score, best_section = _score_page(page, terms)
+        # Pages that score zero or below are left out entirely.
        if score <= 0:
            continue
-        scored.append((score, rel_path, title, body))
+        ranked.append((score, page.order, page, best_section))

-    scored.sort(key=lambda item: (-item[0], item[1]))
-
-    results: list[dict] = []
-    for score, rel_path, title, body in scored[:effective_limit]:
-        results.append(
-            {
-                "path": rel_path,
-                "url": _docs_url_for(rel_path),
-                "title": title,
-                "score": score,
-                "snippet": _build_snippet(body, terms),
-            }
-        )
-    return results
+    # Best score first; ties break on page.order, then on path.
+    ranked.sort(key=lambda item: (-item[0], item[1], item[2].path))
+    return [
+        page.to_catalog_dict(section=best_section)
+        for _, _, page, best_section in ranked[:capped_limit]
+    ]
--- a/api/services/workflow/workflow_graph.py
+++ b/api/services/workflow/workflow_graph.py
@ -1,6 +1,6 @@
 import re
 from collections import Counter
-from typing import Any, Dict, List, Set
+from typing import Dict, List, Set

 from api.services.workflow.dto import EdgeDataDTO, NodeType, ReactFlowDTO
 from api.services.workflow.errors import ItemKind, WorkflowError
--- a/api/tests/test_mcp_docs_search.py
+++ b/api/tests/test_mcp_docs_search.py
@ -1,14 +1,4 @@
-"""Unit tests for the `search_docs` MCP tool.
-
-The tool reads the docs corpus from disk via ``_resolve_docs_root`` and
-caches it with ``functools.lru_cache``. These tests point the cache at
-a synthetic corpus per-test so the assertions don't depend on the real
-docs tree (which evolves) and the LRU cache doesn't leak state.
-
-`authenticate_mcp_request` is mocked so the tests don't need a live DB
-or a valid API key — mirroring the pattern in
-``test_mcp_save_workflow.py``.
-"""
+"""Unit tests for the MCP docs discovery tools."""

 from __future__ import annotations

@ -17,71 +7,152 @@ from pathlib import Path
 from unittest.mock import AsyncMock, patch

 import pytest
+from fastapi import HTTPException

 from api.mcp_server.tools import docs_search as docs_search_module
 from api.mcp_server.tools.docs_search import (
-    _docs_url_for,
+    _docs_index,
    _extract_page_title,
    _resolve_docs_root,
    _score_page,
    _strip_frontmatter,
    _tokenize_query,
+    list_docs,
+    read_doc,
    search_docs,
 )


-# ─── Fixtures ────────────────────────────────────────────────────────────
+def _clear_docs_caches() -> None:
+    """Reset the cache held on ``docs_search_module._docs_index``."""
+    cached_index_loader = docs_search_module._docs_index
+    cached_index_loader.cache_clear()


@pytest.fixture
 def fake_docs_root(tmp_path: Path) -> Path:
-    """Build a minimal docs tree on disk and point the tool at it."""
+    # Builds a docs tree under tmp_path, points DOGRAH_DOCS_PATH at it for the
+    # duration of the test, and yields the tree's root. docs.json lists four
+    # pages; internal-only.mdx and AGENTS.md exist on disk but are not listed.
    docs_root = tmp_path / "docs"
    docs_root.mkdir()

-    (docs_root / "configurations").mkdir()
-    (docs_root / "configurations" / "voice.mdx").write_text(
+    (docs_root / "getting-started").mkdir()
+    (docs_root / "getting-started" / "index.mdx").write_text(
        "---\n"
-        'title: "Voice"\n'
+        'title: "Getting started"\n'
+        'description: "Start using Dograh."\n'
        "---\n\n"
-        "# Voice configuration\n\n"
-        "Dograh supports ElevenLabs and Cartesia TTS providers.\n"
-        "Configure the ElevenLabs voice_id in your workspace settings.\n",
+        "# Getting started\n\n"
+        "Welcome to Dograh.\n",
        encoding="utf-8",
    )
-    (docs_root / "configurations" / "transcriber.mdx").write_text(
+
+    (docs_root / "voice-agent").mkdir()
+    (docs_root / "voice-agent" / "introduction.mdx").write_text(
        "---\n"
-        'title: "Transcriber"\n'
+        'title: "Voice Agent Builder"\n'
+        'description: "Build conversational workflows."\n'
        "---\n\n"
-        "# Speech-to-text\n\nDeepgram is the default transcriber.\n",
+        "# Voice Agent Builder\n\n"
+        "Build workflows with nodes and tools.\n",
+        encoding="utf-8",
+    )
+
+    (docs_root / "voice-agent" / "tools").mkdir()
+    (docs_root / "voice-agent" / "tools" / "mcp-tool.mdx").write_text(
+        "---\n"
+        'title: "MCP Tool"\n'
+        'description: "Connect external MCP servers."\n'
+        'llm_hint: "Use for MCP server setup, remote tools, or model context protocol questions."\n'
+        "aliases:\n"
+        '  - "model context protocol"\n'
+        "---\n\n"
+        "# MCP Tool\n\n"
+        "Connect an external MCP server to your voice agent.\n\n"
+        "## Authentication\n\n"
+        "Provide the MCP endpoint URL and headers.\n",
        encoding="utf-8",
    )

    (docs_root / "deployment").mkdir()
-    (docs_root / "deployment" / "turn-server.mdx").write_text(
+    (docs_root / "deployment" / "docker.mdx").write_text(
        "---\n"
-        'title: "TURN server setup"\n'
+        'title: "Docker"\n'
+        'description: "Deploy Dograh with Docker."\n'
+        'llm_hint: "Use for Docker deployment, local setup, remote setup, TURN server, coturn, or WebRTC connectivity questions."\n'
+        "aliases:\n"
+        '  - "coturn"\n'
+        '  - "turn server"\n'
        "---\n\n"
-        "# TURN server\n\n"
-        "WebRTC requires a TURN server for NAT traversal. Coturn is the "
-        "recommended choice for self-hosted deployments.\n",
+        "# Docker\n\n"
+        "Run Dograh with Docker.\n\n"
+        "## Troubleshooting WebRTC Connectivity\n\n"
+        "If audio fails or ICE fails, configure a TURN server. Coturn is the recommended choice.\n",
        encoding="utf-8",
    )

-    # A non-doc file that must be ignored by the corpus loader.
-    (docs_root / "docs.json").write_text('{"name":"Dograh"}', encoding="utf-8")
+    # Hidden/orphaned docs page: present on disk but not in docs.json, so it
+    # must not be indexed by the MCP tools.
+    (docs_root / "internal-only.mdx").write_text(
+        "---\n"
+        'title: "Internal TURN Notes"\n'
+        "---\n\n"
+        "# Internal TURN Notes\n\n"
+        "This page mentions zyxinternalturntoken but is not user-facing.\n",
+        encoding="utf-8",
+    )

-    # Reset the LRU cache and pin the resolver to our tmp tree.
-    docs_search_module._docs_corpus.cache_clear()
+    # A Markdown file that docs.json does not reference either.
+    (docs_root / "AGENTS.md").write_text("# Internal instructions\n", encoding="utf-8")
+
+    # Navigation: tab "Guides" (two groups, one with a nested "Tools" group)
+    # and tab "Developer" (one group).
+    (docs_root / "docs.json").write_text(
+        """{
+  "navigation": {
+    "tabs": [
+      {
+        "tab": "Guides",
+        "groups": [
+          {
+            "group": "Getting started",
+            "pages": [
+              "getting-started/index"
+            ]
+          },
+          {
+            "group": "Voice Agent Builder",
+            "pages": [
+              "voice-agent/introduction",
+              {
+                "group": "Tools",
+                "pages": [
+                  "voice-agent/tools/mcp-tool"
+                ]
+              }
+            ]
+          }
+        ]
+      },
+      {
+        "tab": "Developer",
+        "groups": [
+          {
+            "group": "Deployment",
+            "pages": [
+              "deployment/docker"
+            ]
+          }
+        ]
+      }
+    ]
+  }
+}
+""",
+        encoding="utf-8",
+    )
+
+    # The cached index is cleared before the env override is applied and again
+    # after the test, so an index built from another docs root is not reused.
+    _clear_docs_caches()
    with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(docs_root)}):
        yield docs_root
-    docs_search_module._docs_corpus.cache_clear()
+    _clear_docs_caches()


@pytest.fixture
 def authed_user():
-    """Stub ``authenticate_mcp_request`` so tests skip the API-key path."""
-
    class _FakeUser:
        selected_organization_id = 1
        id = 42
@ -93,18 +164,8 @@ def authed_user():
        yield _FakeUser()


-# ─── Pure helpers ────────────────────────────────────────────────────────
-
-
-def test_tokenize_query_strips_short_and_punct_terms():
-    """Punctuation and 1-char tokens must not bleed into the scorer.
-
-    A trailing `?` or stray `a` would otherwise match nearly every page
-    and flatten the relevance ranking.
-    """
-    assert _tokenize_query("How do I configure a TURN server?") == [
-        "how",
-        "do",
+def test_tokenize_query_dedupes_and_drops_stopwords():
+    assert _tokenize_query("How do I configure a TURN server TURN?") == [
        "configure",
        "turn",
        "server",
@ -121,155 +182,92 @@ def test_strip_frontmatter_removes_yaml_block():
    assert _strip_frontmatter(body).startswith("# Heading")


-def test_strip_frontmatter_passes_through_when_missing():
-    body = "# Just a heading\nbody text\n"
-    assert _strip_frontmatter(body) == body
-
-
 def test_extract_page_title_prefers_frontmatter():
+    # The frontmatter title and the first heading differ; the frontmatter one wins.
    body = '---\ntitle: "Front Title"\n---\n\n# Heading Title\n'
    assert _extract_page_title(body, fallback="x.mdx") == "Front Title"


 def test_extract_page_title_falls_back_to_first_heading():
-    """When frontmatter is missing the first ATX heading is the next best
-    signal — better than just returning the filename, which often is
-    a slug not a human-readable title."""
+    # No frontmatter here, so the title must come from the first "# " heading
+    # rather than from the ``fallback`` argument.
    body = "# Heading Title\nbody\n"
    assert _extract_page_title(body, fallback="x.mdx") == "Heading Title"


-def test_extract_page_title_falls_back_to_filename_when_nothing_matches():
-    body = "plain prose with no heading or frontmatter"
-    assert _extract_page_title(body, fallback="x.mdx") == "x.mdx"
-
-
-def test_docs_url_for_strips_extension_and_index():
-    assert (
-        _docs_url_for("configurations/voice.mdx")
-        == "https://docs.dograh.com/configurations/voice"
-    )
-    assert (
-        _docs_url_for("getting-started/index.mdx")
-        == "https://docs.dograh.com/getting-started"
-    )
-
-
-def test_score_page_weights_title_above_body():
-    """Title hits must outweigh body hits — otherwise a long page that
-    incidentally mentions the term many times outranks the page whose
-    purpose IS the term."""
-    title_only = _score_page(
-        rel_path="other.mdx", title="TURN server", body="unrelated text", terms=["turn"]
-    )
-    body_only = _score_page(
-        rel_path="other.mdx",
-        title="Unrelated",
-        body="turn turn turn turn turn",
-        terms=["turn"],
-    )
-    assert title_only > body_only
-
-
-def test_score_page_returns_zero_when_no_terms_match():
-    assert (
-        _score_page(
-            rel_path="x.mdx", title="X", body="hello world", terms=["nonexistent"]
-        )
-        == 0
+def test_score_page_uses_llm_hint_and_aliases():
+    """Score a hand-built page for the term "coturn".
+
+    The term is absent from the page title, description and top-level content;
+    it appears only in ``llm_hint``, ``aliases`` and the section content. The
+    score must be positive and that section must be reported as the best match.
+    """
+    page = docs_search_module.DocPage(
+        path="deployment/docker",
+        file_path="deployment/docker.mdx",
+        title="Docker",
+        description="Deploy Dograh with Docker.",
+        llm_hint="Use for TURN server and coturn setup.",
+        aliases=("coturn",),
+        breadcrumb=("Developer", "Deployment"),
+        content="Docker deployment.",
+        sections=(
+            docs_search_module.DocSection(
+                title="Troubleshooting WebRTC Connectivity",
+                slug="troubleshooting-webrtc-connectivity",
+                level=2,
+                content="Configure a TURN server with coturn.",
+            ),
+        ),
+        order=0,
    )
+    score, section = _score_page(page, ["coturn"])
+    assert score > 0
+    assert section is not None
+    assert section.slug == "troubleshooting-webrtc-connectivity"


 def test_resolve_docs_root_honors_env_override(tmp_path: Path):
    docs = tmp_path / "custom_docs"
    docs.mkdir()
+    # NOTE(review): presumably the resolver now only accepts a directory that
+    # contains docs.json — confirm against _resolve_docs_root.
+    (docs / "docs.json").write_text("{}", encoding="utf-8")
    with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(docs)}):
        assert _resolve_docs_root() == docs.resolve()


-def test_resolve_docs_root_ignores_nonexistent_env_value(tmp_path: Path):
-    """A bogus env value must not crash the tool — fall back to discovery
-    (the real ``docs/`` in the repo) instead."""
-    with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(tmp_path / "nope")}):
-        # Walk-up discovery should land somewhere (the repo's actual docs)
-        # but we don't assert the exact path because it depends on where
-        # the tests are run; we just assert no crash and either None or a dir.
-        resolved = _resolve_docs_root()
-        assert resolved is None or resolved.is_dir()
-
-
-# ─── End-to-end tool behaviour ───────────────────────────────────────────
-
-
@pytest.mark.asyncio
-async def test_search_docs_ranks_turn_setup_first_for_turn_query(
+async def test_search_docs_ranks_turn_doc_and_uses_route_path(
    fake_docs_root, authed_user
 ):
-    """The page whose title and body are both about TURN must outrank
-    incidental mentions of related words on other pages."""
-    results = await search_docs("How do I set up a TURN server?")
-    assert results, "expected at least one result"
-    assert results[0]["path"] == "deployment/turn-server.mdx"
-    assert results[0]["url"] == "https://docs.dograh.com/deployment/turn-server"
-    assert "TURN server" in results[0]["title"]
-    assert "TURN" in results[0]["snippet"] or "turn" in results[0]["snippet"].lower()
+    results = await search_docs("How do I configure coturn for WebRTC?")
+    assert results
+    # The top hit is addressed by its route (no file extension) and names the
+    # section that matched best.
+    assert results[0]["path"] == "deployment/docker"
+    assert results[0]["section_slug"] == "troubleshooting-webrtc-connectivity"
+    assert "TURN server" in results[0]["llm_hint"]
+    # These keys must not be present in a search result.
+    assert "snippet" not in results[0]
+    assert "score" not in results[0]
+    assert "url" not in results[0]


@pytest.mark.asyncio
-async def test_search_docs_excludes_non_doc_files(fake_docs_root, authed_user):
-    """``docs.json`` must not appear — the corpus loader filters to
-    .mdx/.md only."""
-    results = await search_docs("Dograh")
-    paths = [r["path"] for r in results]
-    assert "docs.json" not in paths
-
-
-@pytest.mark.asyncio
-async def test_search_docs_returns_empty_when_no_match(fake_docs_root, authed_user):
-    results = await search_docs("xyzzy unrelated zzz")
+async def test_search_docs_indexes_only_docs_json_pages(fake_docs_root, authed_user):
+    # This token occurs only in internal-only.mdx, which docs.json does not list.
+    results = await search_docs("zyxinternalturntoken")
    assert results == []


@pytest.mark.asyncio
 async def test_search_docs_respects_limit(fake_docs_root, authed_user):
-    """``limit=1`` must collapse the result list even if multiple pages
-    match."""
-    results = await search_docs("Dograh", limit=1)
+    # Several fixture pages mention Dograh; limit=1 must still yield exactly one.
+    results = await search_docs("dograh", limit=1)
    assert len(results) == 1


@pytest.mark.asyncio
-async def test_search_docs_clamps_limit_to_hard_cap(fake_docs_root, authed_user):
-    """A pathological large limit must be clamped to
-    ``DOCS_SEARCH_MAX_LIMIT`` (=25) so the payload stays bounded."""
-    # Drop in extra docs so there's headroom to verify the clamp.
-    for i in range(30):
-        (fake_docs_root / f"extra-{i}.mdx").write_text(
-            f"# Page {i}\nThis Dograh page covers configurations topic {i}.\n",
-            encoding="utf-8",
-        )
-    docs_search_module._docs_corpus.cache_clear()
-    results = await search_docs("Dograh", limit=999)
-    assert len(results) <= 25
+async def test_search_docs_returns_empty_when_no_match(fake_docs_root, authed_user):
+    # None of these words occur anywhere in the fixture tree.
+    assert await search_docs("xyzzy unrelated zzz") == []


@pytest.mark.asyncio
 async def test_search_docs_returns_empty_when_no_corpus(
    tmp_path, authed_user, monkeypatch
 ):
-    """If the docs directory doesn't exist on disk, the tool must
-    degrade to an empty list rather than raising — Docker images and
-    dev checkouts can disagree on layout."""
+    # With no docs root resolvable, search must return an empty list instead
+    # of raising.
    nonexistent = tmp_path / "no-docs-here"
    monkeypatch.setenv("DOGRAH_DOCS_PATH", str(nonexistent))
-    # Also block the walk-up fallback by pointing the resolver at a
-    # tmp path with no `docs/` ancestor.
-    docs_search_module._docs_corpus.cache_clear()
-    with patch(
-        "api.mcp_server.tools.docs_search._resolve_docs_root", return_value=None
-    ):
-        results = await search_docs("anything")
-    assert results == []
+    _clear_docs_caches()
+    try:
+        with patch(
+            "api.mcp_server.tools.docs_search._resolve_docs_root",
+            return_value=None,
+        ):
+            assert await search_docs("anything") == []
+    finally:
+        # The index cached during this test was built while the resolver
+        # returned None. Drop it even on failure so a later test that does not
+        # use ``fake_docs_root`` cannot pick up the empty index.
+        _clear_docs_caches()


@pytest.mark.asyncio
@ -279,16 +277,83 @@ async def test_search_docs_rejects_empty_query(fake_docs_root, authed_user):


@pytest.mark.asyncio
-async def test_search_docs_rejects_query_with_no_real_terms(
+async def test_search_docs_rejects_query_with_only_stopwords(
    fake_docs_root, authed_user
 ):
-    """A query like `"???"` tokenizes to nothing — surface an actionable
-    error rather than silently returning every page."""
-    with pytest.raises(ValueError, match="2\\+ alphanumeric"):
-        await search_docs("?? // !!")
+    # Every word in this query is expected to be discarded during tokenization,
+    # which leaves no terms and must raise.
+    with pytest.raises(ValueError, match="non-stopword"):
+        await search_docs("how do I")


@pytest.mark.asyncio
 async def test_search_docs_rejects_zero_limit(fake_docs_root, authed_user):
+    # limit=0 is below the minimum of 1 and must raise rather than return nothing.
    with pytest.raises(ValueError, match="at least 1"):
        await search_docs("Dograh", limit=0)
+
+
+@pytest.mark.asyncio
+async def test_list_docs_returns_top_level_sections(fake_docs_root, authed_user):
+    """With no path, ``list_docs`` starts with the Guides groups as sections."""
+    entries = await list_docs()
+    first, second = entries[0], entries[1]
+    assert first["kind"] == "section"
+    assert first["path"] == "guides/getting-started"
+    assert second["path"] == "guides/voice-agent-builder"
+
+
+@pytest.mark.asyncio
+async def test_list_docs_depth_expands_children(fake_docs_root, authed_user):
+    """``depth=2`` lists the nested Tools section and the page inside it."""
+    listing = await list_docs("guides/voice-agent-builder", depth=2)
+    listed_paths = {entry["path"] for entry in listing}
+    for expected in (
+        "voice-agent/introduction",
+        "guides/voice-agent-builder/tools",
+        "voice-agent/tools/mcp-tool",
+    ):
+        assert expected in listed_paths
+
+
+@pytest.mark.asyncio
+async def test_list_docs_rejects_unknown_section(fake_docs_root, authed_user):
+    """A path that is neither a section nor a page raises ``HTTPException``."""
+    unknown_section = pytest.raises(HTTPException, match="Unknown docs section")
+    with unknown_section:
+        await list_docs("nope")
+
+
+@pytest.mark.asyncio
+async def test_read_doc_returns_full_page_and_sections(fake_docs_root, authed_user):
+    """Reading a page without ``section`` returns its content and section list."""
+    doc = await read_doc("deployment/docker")
+    assert doc["path"] == "deployment/docker"
+    assert doc["title"] == "Docker"
+    assert "url" not in doc
+    slugs = {entry["slug"] for entry in doc["sections"]}
+    assert "docker" in slugs
+    assert "troubleshooting-webrtc-connectivity" in slugs
+    # A case-insensitive check covers both "Coturn" and "coturn".
+    assert "coturn" in doc["content"].lower()
+
+
+@pytest.mark.asyncio
+async def test_read_doc_can_target_section(fake_docs_root, authed_user):
+    """Passing ``section`` narrows the returned content to that section."""
+    wanted_slug = "troubleshooting-webrtc-connectivity"
+    doc = await read_doc("deployment/docker", section=wanted_slug)
+    assert doc["section_slug"] == "troubleshooting-webrtc-connectivity"
+    text = doc["content"]
+    assert any(marker in text for marker in ("ICE fails", "TURN server"))
+    # Text from the page intro must not leak into the section view.
+    assert "Run Dograh with Docker." not in text
+
+
+@pytest.mark.asyncio
+async def test_read_doc_rejects_unknown_page(fake_docs_root, authed_user):
+    """A route that is not in the index raises ``HTTPException``."""
+    unknown_page = pytest.raises(HTTPException, match="Unknown docs page")
+    with unknown_page:
+        await read_doc("missing/page")
+
+
+@pytest.mark.asyncio
+async def test_read_doc_rejects_unknown_section(fake_docs_root, authed_user):
+    """A known page with an unknown ``section`` raises ``HTTPException``."""
+    unknown_section = pytest.raises(HTTPException, match="Unknown section")
+    with unknown_section:
+        await read_doc("deployment/docker", section="missing-section")
+
+
+def test_docs_index_uses_docs_json_navigation(fake_docs_root):
+    """The index follows docs.json: unlisted files are absent, nested groups become sections."""
+    docs = _docs_index()
+    pages, sections = docs.pages_by_path, docs.sections_by_path
+    assert "internal-only" not in pages
+    assert "guides/voice-agent-builder/tools" in sections
+    expected_breadcrumb = ("Guides", "Voice Agent Builder", "Tools")
+    assert pages["voice-agent/tools/mcp-tool"].breadcrumb == expected_breadcrumb