feat: improve docs search

This commit is contained in:
Abhishek Kumar 2026-05-20 18:18:05 +05:30
parent 4618af20b8
commit 5c638070e0
5 changed files with 876 additions and 404 deletions

View file

@ -16,6 +16,11 @@ You build and edit Dograh voice-AI workflows by emitting TypeScript that uses th
## Call order
### Reading documentation
1. `search_docs(query)` — use first for keyword or acronym lookup when the user is asking how Dograh works or how to configure something.
2. `read_doc(path)` — fetch the full page once one result looks likely. Prefer this over reasoning from search summaries alone.
3. `list_docs(path=None, depth=1)` — use when the user wants to browse a topic area or when search terms are too vague. Returned section paths feed back into `list_docs`; returned page paths feed into `read_doc`.
### Editing an existing workflow
1. `list_workflows` — locate the target workflow.
2. `get_workflow_code(workflow_id)` — fetch the current source.

View file

@ -1,4 +1,5 @@
from fastmcp import FastMCP
from mcp.types import ToolAnnotations
from api.mcp_server.instructions import DOGRAH_MCP_INSTRUCTIONS
from api.mcp_server.tools.catalog import (
@ -8,7 +9,7 @@ from api.mcp_server.tools.catalog import (
list_tools,
)
from api.mcp_server.tools.create_workflow import create_workflow
from api.mcp_server.tools.docs_search import search_docs
from api.mcp_server.tools.docs_search import list_docs, read_doc, search_docs
from api.mcp_server.tools.get_workflow_code import get_workflow_code
from api.mcp_server.tools.node_types import get_node_type, list_node_types
from api.mcp_server.tools.save_workflow import save_workflow
@ -28,6 +29,15 @@ for _tool in (
list_tools,
list_workflows,
save_workflow,
search_docs,
):
mcp.tool(_tool)
_DOCS_TOOL_ANNOTATIONS = ToolAnnotations(
readOnlyHint=True,
idempotentHint=True,
destructiveHint=False,
openWorldHint=False,
)
for _tool in (list_docs, read_doc, search_docs):
mcp.tool(_tool, annotations=_DOCS_TOOL_ANNOTATIONS)

View file

@ -1,312 +1,704 @@
"""`search_docs` MCP tool — keyword search over the Mintlify docs tree.
"""MCP docs discovery tools over the Mintlify docs tree.
The docs are shipped into the API image (`COPY ./docs ./docs` in
`api/Dockerfile`), so this tool works for both source/dev runs and
Docker deployments. For source/dev runs we walk up from this file to
locate the `docs/` directory; for Docker we land on `/app/docs`. An
explicit `DOGRAH_DOCS_PATH` env var overrides discovery.
The docs surface is intentionally split into three steps:
The implementation is intentionally dependency-free: it does in-memory
keyword scoring rather than building a vector index. The docs corpus is
small (~100 .mdx files, ~140k LoC), so a per-call scan is well under
50 ms and avoids needing an embedding backend, vector store, or
background indexer for a tool that's called interactively from MCP.
- ``list_docs`` for lightweight navigation over the published hierarchy
- ``search_docs`` for keyword lookup across the visible docs catalog
- ``read_doc`` for the full content of one chosen page (or one section)
The runtime index is derived from ``docs/docs.json`` plus the referenced
``.mdx``/``.md`` files. That keeps navigation, ordering, and visibility in
sync with the published docs rather than indexing every file under ``docs/``.
"""
from __future__ import annotations
import json
import os
import re
from collections import Counter
from dataclasses import dataclass, replace
from functools import lru_cache
from pathlib import Path
from typing import Any
import yaml
from fastapi import HTTPException
from api.mcp_server.auth import authenticate_mcp_request
from api.mcp_server.tracing import traced_tool
# Public site for the rendered docs. Used to build a clickable URL per
# result; agents can hand the URL back to the user even if the local
# file isn't reachable.
DOCS_SITE_BASE_URL = "https://docs.dograh.com"
# Hard cap regardless of caller-supplied limit. Keeps the MCP response
# payload bounded; Mintlify search APIs use a similar 10-25 ceiling.
DOCS_SEARCH_MAX_LIMIT = 25
# Largest `depth` accepted by `list_docs`; values outside 1..this are rejected.
DOCS_LIST_MAX_DEPTH = 3
# Key of the synthetic top-level "Docs" section in the section map.
# `list_docs` starts from it when no path is given.
_ROOT_SECTION_PATH = "__root__"
# Word-like tokens: runs of ASCII letters, digits, and underscores.
_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
# A `---` ... `---` frontmatter block anchored at the very start of the text.
_FRONTMATTER_RE = re.compile(r"\A---\s*\n(.*?)\n---\s*\n?", re.DOTALL)
# Heading-detection regex. Matches ATX headings (`# `, `## `, etc.) at the
# start of a line, but not in-line `#` characters.
# NOTE(review): `\s+` also matches a newline, so a bare `#` line followed by
# a text line would be read as a heading — confirm whether that matters.
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.*?)\s*$", re.MULTILINE)
# Common English filler words. The tokenizers (`_tokenize_text` and
# `_tokenize_query`) drop these so they never contribute to a score.
_STOPWORDS = {
    "a",
    "an",
    "and",
    "are",
    "at",
    "be",
    "by",
    "can",
    "do",
    "for",
    "from",
    "how",
    "i",
    "if",
    "in",
    "is",
    "it",
    "me",
    "my",
    "of",
    "on",
    "or",
    "the",
    "to",
    "what",
    "when",
    "where",
    "with",
    "you",
    "your",
}
@dataclass(frozen=True)
class DocSection:
    """One heading-delimited slice of a docs page body."""

    # Heading text, or a placeholder such as "Overview" for text that
    # precedes the first heading.
    title: str
    # Identifier derived from the title; accepted by `read_doc(section=...)`.
    slug: str
    # Number of `#` characters in the heading (1 for the synthetic overview).
    level: int
    # Section text, starting at the heading line when there is one.
    content: str
@dataclass(frozen=True)
class DocPage:
    """A published docs page plus the metadata used for listing and search."""

    path: str
    file_path: str
    title: str
    description: str
    llm_hint: str
    aliases: tuple[str, ...]
    breadcrumb: tuple[str, ...]
    content: str
    sections: tuple[DocSection, ...]
    order: int

    def breadcrumb_text(self) -> str:
        """Render the breadcrumb trail as a single ``A > B > C`` string."""
        separator = " > "
        return separator.join(self.breadcrumb)

    def routing_hint(self) -> str:
        """Return ``llm_hint`` when it is set, otherwise the description."""
        if self.llm_hint:
            return self.llm_hint
        return self.description

    def to_catalog_dict(self, section: DocSection | None = None) -> dict:
        """Build the compact entry used in listings and search results.

        When ``section`` is given, its title and slug are attached so the
        caller can jump straight to it. Empty values are dropped.
        """
        entry = {
            "kind": "page",
            "path": self.path,
            "title": self.title,
            "breadcrumb": self.breadcrumb_text(),
            "llm_hint": self.routing_hint(),
        }
        if section is not None:
            entry.update(section_title=section.title, section_slug=section.slug)
        return _compact_dict(entry)

    def to_read_dict(self, section: DocSection | None = None) -> dict:
        """Build the full read payload for the page or for one section.

        ``content`` is the section text when ``section`` is given and the
        whole page body otherwise. The ``sections`` outline always covers
        the whole page. Empty values are dropped.
        """
        narrowed = section is not None
        outline = []
        for sec in self.sections:
            if sec.title and sec.slug:
                outline.append({"title": sec.title, "slug": sec.slug})
        payload = {
            "path": self.path,
            "title": self.title,
            "breadcrumb": self.breadcrumb_text(),
            "llm_hint": self.routing_hint(),
            "section_title": section.title if narrowed else None,
            "section_slug": section.slug if narrowed else None,
            "content": section.content if narrowed else self.content,
            "sections": outline,
        }
        return _compact_dict(payload)
@dataclass(frozen=True)
class NavSection:
    """A navigation group: an ordered list of child pages and sub-sections."""

    path: str
    title: str
    breadcrumb: tuple[str, ...]
    # Each child is a ``(kind, path)`` pair where kind is "page" or "section".
    children: tuple[tuple[str, str], ...]
    descendant_page_count: int = 0

    def breadcrumb_text(self) -> str:
        """Render the breadcrumb trail as a single ``A > B > C`` string."""
        separator = " > "
        return separator.join(self.breadcrumb)

    def to_mcp_dict(self) -> dict:
        """Build the listing entry for this section, dropping empty values."""
        page_total = self.descendant_page_count
        child_total = len(self.children)
        hint = f"Browse {page_total} docs in this section." if page_total else None
        entry = {
            "kind": "section",
            "path": self.path,
            "title": self.title,
            "breadcrumb": self.breadcrumb_text(),
            "llm_hint": hint,
            "has_children": child_total > 0,
            "child_count": child_total,
            "page_count": page_total,
        }
        return _compact_dict(entry)
@dataclass(frozen=True)
class DocsIndex:
    """In-memory lookup tables built from the docs navigation."""

    # Docs route (for example ``voice-agent/tools/mcp-tool``) -> page.
    pages_by_path: dict[str, DocPage]
    # Navigation section path -> section, including the synthetic root.
    sections_by_path: dict[str, NavSection]
def _compact_dict(data: dict[str, Any]) -> dict[str, Any]:
return {
key: value for key, value in data.items() if value not in (None, "", [], (), {})
}
def _slugify(value: str) -> str:
slug = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")
return slug or "section"
def _coerce_docs_root(candidate: Path) -> Path | None:
candidate = candidate.expanduser().resolve()
if (candidate / "docs.json").is_file():
return candidate
nested = candidate / "docs"
if (nested / "docs.json").is_file():
return nested
return None
def _resolve_docs_root() -> Path | None:
"""Return the path to the on-disk docs tree, or None if not found.
Resolution order:
1. ``DOGRAH_DOCS_PATH`` env var (absolute path).
2. ``/app/docs`` the location the API Dockerfile copies docs to.
3. Walk upward from this file looking for a sibling ``docs/`` dir
(covers source-checkout / dev runs).
"""
"""Return the path to the on-disk docs tree, or None if not found."""
override = os.environ.get("DOGRAH_DOCS_PATH")
if override:
candidate = Path(override).expanduser().resolve()
if candidate.is_dir():
return candidate
resolved = _coerce_docs_root(Path(override))
if resolved is not None:
return resolved
docker_default = Path("/app/docs")
if docker_default.is_dir():
docker_default = _coerce_docs_root(Path("/app/docs"))
if docker_default is not None:
return docker_default
# Walk up from .../api/mcp_server/tools/docs_search.py looking for docs/.
for parent in Path(__file__).resolve().parents:
candidate = parent / "docs"
if candidate.is_dir():
return candidate
resolved = _coerce_docs_root(parent / "docs")
if resolved is not None:
return resolved
return None
@lru_cache(maxsize=1)
def _docs_corpus() -> tuple[tuple[str, str], ...]:
"""Load the docs corpus once per process.
Returns a tuple of ``(relative_path, file_contents)`` pairs. The
docs tree is small and read-mostly at runtime, so caching the full
text in memory is cheaper than re-reading on every search.
Cache miss is intentional when ``DOGRAH_DOCS_PATH`` flips at
startup for live edits, restart the process.
"""
root = _resolve_docs_root()
if root is None:
return ()
pairs: list[tuple[str, str]] = []
for path in sorted(root.rglob("*")):
if not path.is_file():
continue
if path.suffix.lower() not in {".mdx", ".md"}:
continue
try:
contents = path.read_text(encoding="utf-8")
except (OSError, UnicodeDecodeError):
# Skip unreadable files rather than crashing the whole tool.
continue
rel = path.relative_to(root).as_posix()
pairs.append((rel, contents))
return tuple(pairs)
def _tokenize_query(query: str) -> list[str]:
"""Split a user query into lowercased keyword terms.
Empty strings and 1-char filler terms are dropped they would
match almost every file and drown out the real signal.
"""
terms = re.findall(r"[A-Za-z0-9_]+", query.lower())
return [term for term in terms if len(term) >= 2]
def _extract_page_title(contents: str, fallback: str) -> str:
"""Pull a human-readable title for a docs page.
Mintlify pages start with a YAML frontmatter block whose ``title``
is the most authoritative title; fall back to the first ATX heading
if frontmatter is missing or malformed; fall back to the filename
if no heading exists.
"""
if contents.startswith("---"):
end = contents.find("---", 3)
if end != -1:
frontmatter = contents[3:end]
for line in frontmatter.splitlines():
line = line.strip()
if line.lower().startswith("title:"):
value = line.split(":", 1)[1].strip()
# Strip surrounding quotes if Mintlify wrote them.
if (
len(value) >= 2
and value[0] == value[-1]
and value[0] in ('"', "'")
):
value = value[1:-1]
if value:
return value
match = _HEADING_RE.search(contents)
if match:
return match.group(2).strip()
return fallback
def _split_frontmatter(contents: str) -> tuple[dict[str, Any], str]:
    """Separate a leading YAML frontmatter block from the page body.

    Returns ``(frontmatter, body)``. When there is no frontmatter block, or
    its YAML cannot be parsed, the result is ``({}, contents)`` with the
    text left untouched. Frontmatter that parses to something other than a
    mapping is treated as empty but is still removed from the body.
    """
    found = _FRONTMATTER_RE.match(contents)
    if found is None:
        return {}, contents
    try:
        parsed = yaml.safe_load(found.group(1))
    except yaml.YAMLError:
        return {}, contents
    metadata = parsed if parsed and isinstance(parsed, dict) else {}
    remainder = contents[found.end() :]
    return metadata, remainder.lstrip("\n")
def _strip_frontmatter(contents: str) -> str:
"""Drop the YAML frontmatter block from a docs page body."""
if not contents.startswith("---"):
return contents
end = contents.find("---", 3)
if end == -1:
return contents
return contents[end + 3 :].lstrip("\n")
return _split_frontmatter(contents)[1]
def _build_snippet(body: str, terms: list[str], snippet_radius: int = 120) -> str:
"""Return a ~240-char window around the first term hit in ``body``.
def _clean_heading_text(raw: str) -> str:
text = re.sub(r"\s*\{#.*\}\s*$", "", raw.strip())
return " ".join(text.split())
The window is centered on the earliest match (whichever term comes
first wins) so the snippet shows context for the strongest signal,
not the lexicographically-first term. Leading/trailing newlines are
collapsed so the snippet renders cleanly through MCP's text payload.
"""
body_lower = body.lower()
earliest = -1
for term in terms:
idx = body_lower.find(term)
if idx != -1 and (earliest == -1 or idx < earliest):
earliest = idx
if earliest == -1:
# No hit in body — the match must have come from the title or
# path, so just return the first line of body as orientation.
first_line = next(
(line.strip() for line in body.splitlines() if line.strip()),
"",
def _extract_page_title(contents: str, fallback: str) -> str:
    """Pull a human-readable title for a docs page.

    Preference order: a non-blank string ``title`` in the frontmatter, then
    the first heading in the body, then ``fallback``.
    """
    metadata, body = _split_frontmatter(contents)
    declared = metadata.get("title")
    if isinstance(declared, str):
        cleaned = declared.strip()
        if cleaned:
            return cleaned
    heading = _HEADING_RE.search(body)
    if heading is None:
        return fallback
    return _clean_heading_text(heading.group(2))
def _normalize_text(value: Any) -> str:
if isinstance(value, str):
return " ".join(value.strip().split())
return ""
def _normalize_aliases(value: Any) -> tuple[str, ...]:
if isinstance(value, str):
aliases = [value]
elif isinstance(value, list):
aliases = [item for item in value if isinstance(item, str)]
else:
aliases = []
return tuple(alias.strip() for alias in aliases if alias.strip())
def _extract_sections(body: str) -> tuple[DocSection, ...]:
matches = list(_HEADING_RE.finditer(body))
stripped_body = body.strip()
if not matches:
if not stripped_body:
return ()
return (
DocSection(
title="Overview",
slug="overview",
level=1,
content=stripped_body,
),
)
return first_line[: snippet_radius * 2]
start = max(0, earliest - snippet_radius)
end = min(len(body), earliest + snippet_radius)
snippet = body[start:end]
# Collapse all whitespace runs (incl. internal newlines) for a
# single-line snippet — MCP renders text payloads inline.
snippet = " ".join(snippet.split())
prefix = "" if start > 0 else ""
suffix = "" if end < len(body) else ""
return f"{prefix}{snippet}{suffix}"
sections: list[DocSection] = []
preamble = body[: matches[0].start()].strip()
if preamble:
sections.append(
DocSection(
title="Overview",
slug="overview",
level=1,
content=preamble,
)
)
for index, match in enumerate(matches):
start = match.start()
end = matches[index + 1].start() if index + 1 < len(matches) else len(body)
title = _clean_heading_text(match.group(2))
sections.append(
DocSection(
title=title or "Section",
slug=_slugify(title or "section"),
level=len(match.group(1)),
content=body[start:end].strip(),
)
)
return tuple(sections)
def _score_page(
rel_path: str,
title: str,
body: str,
terms: list[str],
) -> int:
"""Weighted keyword score for a single docs page.
def _tokenize_text(text: str) -> list[str]:
    """Split ``text`` into lowercase tokens suitable for scoring.

    Tokens shorter than two characters and stopwords are discarded.
    Duplicates and original order are preserved.
    """
    kept: list[str] = []
    for token in _TOKEN_RE.findall(text.lower()):
        if len(token) < 2:
            continue
        if token in _STOPWORDS:
            continue
        kept.append(token)
    return kept
Title/path matches outweigh body matches because they encode the
page's purpose, not just incidental mentions. Each query term
contributes independently a page matching all terms ranks above
one matching a single term many times.
"""
if not terms:
return 0
score = 0
path_lower = rel_path.lower()
title_lower = title.lower()
body_lower = body.lower()
for term in terms:
path_hits = path_lower.count(term)
title_hits = title_lower.count(term)
body_hits = body_lower.count(term)
if path_hits == 0 and title_hits == 0 and body_hits == 0:
# Penalize pages that miss any query term — they probably
# aren't what the caller wants.
def _tokenize_query(query: str) -> list[str]:
"""Split a user query into lowercased keyword terms."""
seen: set[str] = set()
terms: list[str] = []
for token in _TOKEN_RE.findall(query.lower()):
if len(token) < 2 or token in _STOPWORDS or token in seen:
continue
# Diminishing returns past a few hits per term: 1 dominant page
# shouldn't outweigh a page that hits every term. The cap is
# deliberately set so ``title_weight (5)`` strictly exceeds
# ``body_cap (4) × body_weight (1)`` — a page whose TITLE is the
# term must outrank a page that merely mentions it repeatedly.
body_hits = min(body_hits, 4)
score += path_hits * 8 + title_hits * 5 + body_hits
seen.add(token)
terms.append(token)
return terms
def _resolve_doc_file(root: Path, route_path: str) -> Path | None:
candidates = (
root / f"{route_path}.mdx",
root / f"{route_path}.md",
root / route_path / "index.mdx",
root / route_path / "index.md",
)
for candidate in candidates:
if candidate.is_file():
return candidate
return None
def _build_doc_page(
    root: Path,
    route_path: str,
    *,
    breadcrumb: tuple[str, ...],
    order: int,
) -> DocPage | None:
    """Load the file behind ``route_path`` and turn it into a ``DocPage``.

    Returns ``None`` when no source file exists for the route or the file
    cannot be read as UTF-8. The title falls back to the last route
    segment, title-cased with hyphens turned into spaces.
    """
    source_file = _resolve_doc_file(root, route_path)
    if source_file is None:
        return None
    try:
        raw = source_file.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None

    metadata, body = _split_frontmatter(raw)
    last_segment = route_path.rsplit("/", 1)[-1]
    default_title = last_segment.replace("-", " ").title()
    page_body = body.strip()

    return DocPage(
        path=route_path,
        file_path=source_file.relative_to(root).as_posix(),
        title=_extract_page_title(raw, fallback=default_title),
        description=_normalize_text(metadata.get("description")),
        llm_hint=_normalize_text(metadata.get("llm_hint")),
        aliases=_normalize_aliases(metadata.get("aliases")),
        breadcrumb=breadcrumb,
        content=page_body,
        sections=_extract_sections(page_body),
        order=order,
    )
def _score_counter(counter: Counter[str], term: str, *, weight: int, cap: int) -> int:
return min(counter.get(term, 0), cap) * weight
def _normalized_phrase(text: str) -> str:
    """Return the scoring tokens of ``text`` joined by single spaces."""
    tokens = _tokenize_text(text)
    return " ".join(tokens)
def _score_section(section: DocSection, terms: list[str]) -> int:
    """Score how well one section matches the query terms.

    Each term scores on exact token matches: section title (weight 7, at
    most 2 hits) plus section content (weight 1, at most 4 hits). Every
    term that matched somewhere adds 4, and a further 6 is added when the
    whole query occurs in the content as a run of consecutive tokens.

    Args:
        section: The section to score.
        terms: Query terms as produced by the query tokenizer.

    Returns:
        A non-negative score; 0 means the section matched nothing.
    """
    title_counts = Counter(_tokenize_text(section.title))
    body_counts = Counter(_tokenize_text(section.content))
    score = 0
    matched_terms = 0
    for term in terms:
        term_score = _score_counter(
            title_counts, term, weight=7, cap=2
        ) + _score_counter(body_counts, term, weight=1, cap=4)
        if term_score:
            matched_terms += 1
            score += term_score
    score += matched_terms * 4
    phrase = " ".join(terms)
    # Pad both sides with a space so the phrase must line up with whole
    # tokens. Plain substring containment let "urn" match "turn server" and
    # gave a non-zero score to sections in which no term matched.
    if phrase and f" {phrase} " in f" {_normalized_phrase(section.content)} ":
        score += 6
    return score
def _docs_url_for(rel_path: str) -> str:
"""Build the public docs URL for a relative on-disk path."""
# Strip the extension and `index` so `getting-started/index.mdx`
# maps to `/getting-started`, matching Mintlify's routing.
no_ext = re.sub(r"\.(mdx|md)$", "", rel_path, flags=re.IGNORECASE)
if no_ext.endswith("/index"):
no_ext = no_ext[: -len("/index")]
return f"{DOCS_SITE_BASE_URL}/{no_ext}".rstrip("/")
def _score_page(page: DocPage, terms: list[str]) -> tuple[int, DocSection | None]:
    """Score a page against the query and pick its best-matching section.

    Page-level fields score on exact token matches, with per-field weights
    and caps: path (6, cap 3), title (10, cap 2), breadcrumb (4, cap 2),
    routing hint (7, cap 3) and aliases (7, cap 3). The best section score
    is added, plus 8 per term that matched a page-level field. One phrase
    bonus is added for the first of title (12), routing hint (8), path (8)
    or best-section content (4) that contains the whole query as
    consecutive tokens.

    Args:
        page: The page to score.
        terms: Query terms as produced by the query tokenizer.

    Returns:
        ``(score, best_section)``. ``(0, None)`` when ``terms`` is empty or
        nothing matched. ``best_section`` is ``None`` when no section
        scored above zero.
    """
    if not terms:
        return 0, None

    path_counts = Counter(_tokenize_text(page.path))
    title_counts = Counter(_tokenize_text(page.title))
    breadcrumb_counts = Counter(_tokenize_text(" ".join(page.breadcrumb)))
    hint_counts = Counter(_tokenize_text(page.routing_hint()))
    alias_counts = Counter(_tokenize_text(" ".join(page.aliases)))

    score = 0
    matched_terms = 0
    for term in terms:
        term_score = (
            _score_counter(path_counts, term, weight=6, cap=3)
            + _score_counter(title_counts, term, weight=10, cap=2)
            + _score_counter(breadcrumb_counts, term, weight=4, cap=2)
            + _score_counter(hint_counts, term, weight=7, cap=3)
            + _score_counter(alias_counts, term, weight=7, cap=3)
        )
        if term_score:
            matched_terms += 1
            score += term_score

    best_section = None
    best_section_score = 0
    for section in page.sections:
        section_score = _score_section(section, terms)
        if section_score > best_section_score:
            best_section = section
            best_section_score = section_score

    if score == 0 and best_section_score == 0:
        return 0, None

    score += matched_terms * 8 + best_section_score

    phrase = " ".join(terms)

    def contains_phrase(text: str) -> bool:
        # Pad with spaces so the phrase must line up with whole tokens.
        # Plain substring containment let "mcp art" match "mcp artifacts".
        return f" {phrase} " in f" {_normalized_phrase(text)} "

    if phrase:
        if contains_phrase(page.title):
            score += 12
        elif contains_phrase(page.routing_hint()):
            score += 8
        elif contains_phrase(page.path):
            score += 8
        elif best_section is not None and contains_phrase(best_section.content):
            score += 4
    return score, best_section
def _set_descendant_counts(
sections_by_path: dict[str, NavSection],
section_path: str,
) -> int:
section = sections_by_path[section_path]
page_count = 0
for child_kind, child_path in section.children:
if child_kind == "page":
page_count += 1
else:
page_count += _set_descendant_counts(sections_by_path, child_path)
sections_by_path[section_path] = replace(section, descendant_page_count=page_count)
return page_count
@lru_cache(maxsize=1)
def _docs_index() -> DocsIndex:
    """Build the docs index from ``docs.json`` and the pages it references.

    The result is cached by ``lru_cache``; call ``_docs_index.cache_clear()``
    to force a rebuild. Only pages reachable through
    ``navigation.tabs[].groups[].pages`` (including nested groups) are
    indexed. A page listed more than once is loaded once and keeps the
    breadcrumb of its first occurrence.

    Returns:
        An index with no pages and no sections when the docs root cannot
        be found or ``docs.json`` cannot be read or parsed. Otherwise an
        index that always contains the synthetic root section, even when
        the navigation is missing or malformed.
    """
    root = _resolve_docs_root()
    if root is None:
        return DocsIndex(pages_by_path={}, sections_by_path={})

    try:
        docs_config = json.loads((root / "docs.json").read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, json.JSONDecodeError):
        return DocsIndex(pages_by_path={}, sections_by_path={})

    pages_by_path: dict[str, DocPage] = {}
    sections_by_path: dict[str, NavSection] = {}
    page_order = 0

    def ensure_unique_section_path(base_path: str) -> str:
        # Groups with the same title under one parent would collide, so
        # later ones get a numeric suffix (-2, -3, ...).
        if base_path not in sections_by_path:
            return base_path
        suffix = 2
        while f"{base_path}-{suffix}" in sections_by_path:
            suffix += 1
        return f"{base_path}-{suffix}"

    def walk_pages(
        items: list[Any],
        *,
        section_path: str,
        section_title: str,
        ancestor_breadcrumb: tuple[str, ...],
    ) -> None:
        # Register one navigation group and, recursively, its nested groups.
        nonlocal page_order
        children: list[tuple[str, str]] = []
        page_breadcrumb = ancestor_breadcrumb + (section_title,)

        for item in items:
            if isinstance(item, str):
                route_path = item.strip("/")
                if not route_path:
                    continue
                if route_path not in pages_by_path:
                    page = _build_doc_page(
                        root,
                        route_path,
                        breadcrumb=page_breadcrumb,
                        order=page_order,
                    )
                    if page is not None:
                        pages_by_path[route_path] = page
                        page_order += 1
                # Routes whose file is missing or unreadable are left out.
                if route_path in pages_by_path:
                    children.append(("page", route_path))
                continue

            if not isinstance(item, dict):
                continue

            group_title = str(item.get("group", "")).strip()
            nested_pages = item.get("pages")
            if not group_title or not isinstance(nested_pages, list):
                continue

            child_path = ensure_unique_section_path(
                f"{section_path}/{_slugify(group_title)}"
            )
            walk_pages(
                nested_pages,
                section_path=child_path,
                section_title=group_title,
                ancestor_breadcrumb=page_breadcrumb,
            )
            children.append(("section", child_path))

        sections_by_path[section_path] = NavSection(
            path=section_path,
            title=section_title,
            breadcrumb=ancestor_breadcrumb,
            children=tuple(children),
        )

    # docs.json is hand-edited: a non-object top level, a null `navigation`
    # or a null `tabs` must yield an empty navigation instead of raising
    # AttributeError/TypeError on every tool call.
    navigation = docs_config.get("navigation") if isinstance(docs_config, dict) else None
    tabs = navigation.get("tabs") if isinstance(navigation, dict) else None
    if not isinstance(tabs, list):
        tabs = []

    root_children: list[tuple[str, str]] = []
    for tab in tabs:
        if not isinstance(tab, dict):
            continue
        tab_title = str(tab.get("tab", "")).strip() or "Docs"
        groups = tab.get("groups")
        if not isinstance(groups, list):
            # Missing, null, or malformed `groups`: nothing to index here.
            continue
        for group in groups:
            if not isinstance(group, dict):
                continue
            group_title = str(group.get("group", "")).strip()
            group_pages = group.get("pages")
            if not group_title or not isinstance(group_pages, list):
                continue

            top_level_path = ensure_unique_section_path(
                f"{_slugify(tab_title)}/{_slugify(group_title)}"
            )
            walk_pages(
                group_pages,
                section_path=top_level_path,
                section_title=group_title,
                ancestor_breadcrumb=(tab_title,),
            )
            root_children.append(("section", top_level_path))

    sections_by_path[_ROOT_SECTION_PATH] = NavSection(
        path=_ROOT_SECTION_PATH,
        title="Docs",
        breadcrumb=(),
        children=tuple(root_children),
    )
    _set_descendant_counts(sections_by_path, _ROOT_SECTION_PATH)
    return DocsIndex(pages_by_path=pages_by_path, sections_by_path=sections_by_path)
def _get_page_or_404(path: str) -> DocPage:
    """Look up a page by route, ignoring leading and trailing slashes.

    Raises:
        HTTPException: 404 when the index has no page for ``path``.
    """
    pages = _docs_index().pages_by_path
    found = pages.get(path.strip("/"))
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail=f"Unknown docs page: {path!r}")
def _find_section(page: DocPage, section: str) -> DocSection | None:
target = section.strip().lower()
for candidate in page.sections:
if candidate.slug.lower() == target or candidate.title.lower() == target:
return candidate
return None
def _expand_nav_entries(
index: DocsIndex,
section_path: str,
depth: int,
) -> list[dict]:
section = index.sections_by_path[section_path]
results: list[dict] = []
for child_kind, child_path in section.children:
if child_kind == "section":
child_section = index.sections_by_path[child_path]
results.append(child_section.to_mcp_dict())
if depth > 1:
results.extend(_expand_nav_entries(index, child_path, depth - 1))
else:
results.append(index.pages_by_path[child_path].to_catalog_dict())
return results
@traced_tool
async def search_docs(query: str, limit: int = 10) -> list[dict]:
"""Search the Dograh documentation by keyword and return ranked pages.
async def list_docs(path: str | None = None, depth: int = 1) -> list[dict]:
"""Browse the Dograh docs hierarchy before reading a page in full.
Use this when the caller asks "how do I configure X" / "where are the docs for Y" /
"what does Dograh say about Z" anything that should land on a docs page
rather than a workspace resource. For workspace data (agents, recordings,
credentials), use ``list_workflows`` / ``list_recordings`` / ``list_credentials``
instead.
Args:
query: Free-form keywords (e.g. "TURN server", "elevenlabs voice").
Tokenized on non-alphanumeric characters; terms shorter than
2 characters are dropped.
limit: Max pages to return. Capped at 25 regardless of input;
default 10 keeps the payload small enough to inline in MCP.
Returns:
Up to ``limit`` results, sorted by descending relevance score.
Each entry has:
* ``path`` repo-relative path (e.g. ``configurations/voice.mdx``)
* ``url`` public docs URL (https://docs.dograh.com/...)
* ``title`` page title (from Mintlify frontmatter when present)
* ``score`` opaque integer relevance score
* ``snippet`` ~240-char excerpt around the first term hit
``path`` addresses navigation sections exposed by this tool. Page paths
returned by ``search_docs`` and ``read_doc`` are the published docs routes
instead, for example ``voice-agent/tools/mcp-tool``.
"""
await authenticate_mcp_request()
if depth < 1 or depth > DOCS_LIST_MAX_DEPTH:
raise ValueError(f"`depth` must be between 1 and {DOCS_LIST_MAX_DEPTH}.")
index = _docs_index()
if not index.sections_by_path:
return []
if path is None:
return _expand_nav_entries(index, _ROOT_SECTION_PATH, depth)
normalized = path.strip("/")
if normalized in index.sections_by_path:
return _expand_nav_entries(index, normalized, depth)
if normalized in index.pages_by_path:
return [index.pages_by_path[normalized].to_catalog_dict()]
raise HTTPException(status_code=404, detail=f"Unknown docs section: {path!r}")
@traced_tool
async def read_doc(path: str, section: str | None = None) -> dict:
    """Read one docs page after you have narrowed to a likely match."""
    # NOTE(review): the docstring above is presumably surfaced as the MCP
    # tool description — confirm before rewording it.
    #
    # `path` is a docs route; `section` optionally narrows the returned
    # content to one section, matched by slug or title. Raises ValueError
    # for a blank path and HTTPException(404) for an unknown page or section.
    await authenticate_mcp_request()

    path_is_usable = isinstance(path, str) and bool(path.strip())
    if not path_is_usable:
        raise ValueError("`path` must be a non-empty string.")

    page = _get_page_or_404(path)
    if section is None:
        return page.to_read_dict(section=None)

    matched = _find_section(page, section)
    if matched is None:
        raise HTTPException(
            status_code=404,
            detail=f"Unknown section {section!r} for docs page {path!r}",
        )
    return page.to_read_dict(section=matched)
@traced_tool
async def search_docs(query: str, limit: int = 5) -> list[dict]:
"""Search the Dograh documentation and return a lean ranked shortlist.
Use this first for keyword or acronym lookup. Once the right page looks
likely, call ``read_doc(path)`` instead of reasoning from summaries alone.
"""
# Authentication is consistent with the rest of the MCP tools and
# routes through the same rate-limiting path, even though docs are
# not org-scoped data.
await authenticate_mcp_request()
if not isinstance(query, str) or not query.strip():
raise ValueError("query must be a non-empty string.")
try:
effective_limit = int(limit)
except (TypeError, ValueError) as exc:
raise ValueError("limit must be an integer.") from exc
if effective_limit < 1:
raise ValueError("limit must be at least 1.")
effective_limit = min(effective_limit, DOCS_SEARCH_MAX_LIMIT)
raise ValueError("`query` must be a non-empty string.")
if limit < 1:
raise ValueError("`limit` must be at least 1.")
terms = _tokenize_query(query)
if not terms:
# The caller passed something like punctuation-only or only
# single-char tokens — surface an actionable error rather than
# silently returning everything.
raise ValueError(
"query must contain at least one keyword of 2+ alphanumeric characters."
"`query` must contain at least one non-stopword alphanumeric term."
)
corpus = _docs_corpus()
if not corpus:
# Tool is registered but docs aren't on disk — return empty
# rather than 500ing so the caller can degrade gracefully.
index = _docs_index()
if not index.pages_by_path:
return []
scored: list[tuple[int, str, str, str]] = []
for rel_path, contents in corpus:
title = _extract_page_title(contents, fallback=rel_path)
body = _strip_frontmatter(contents)
score = _score_page(rel_path, title, body, terms)
capped_limit = min(limit, DOCS_SEARCH_MAX_LIMIT)
ranked: list[tuple[int, int, DocPage, DocSection | None]] = []
for page in index.pages_by_path.values():
score, best_section = _score_page(page, terms)
if score <= 0:
continue
scored.append((score, rel_path, title, body))
ranked.append((score, page.order, page, best_section))
scored.sort(key=lambda item: (-item[0], item[1]))
results: list[dict] = []
for score, rel_path, title, body in scored[:effective_limit]:
results.append(
{
"path": rel_path,
"url": _docs_url_for(rel_path),
"title": title,
"score": score,
"snippet": _build_snippet(body, terms),
}
)
return results
ranked.sort(key=lambda item: (-item[0], item[1], item[2].path))
return [
page.to_catalog_dict(section=best_section)
for _, _, page, best_section in ranked[:capped_limit]
]

View file

@ -1,6 +1,6 @@
import re
from collections import Counter
from typing import Any, Dict, List, Set
from typing import Dict, List, Set
from api.services.workflow.dto import EdgeDataDTO, NodeType, ReactFlowDTO
from api.services.workflow.errors import ItemKind, WorkflowError

View file

@ -1,14 +1,4 @@
"""Unit tests for the `search_docs` MCP tool.
The tool reads the docs corpus from disk via ``_resolve_docs_root`` and
caches it with ``functools.lru_cache``. These tests point the cache at
a synthetic corpus per-test so the assertions don't depend on the real
docs tree (which evolves) and the LRU cache doesn't leak state.
`authenticate_mcp_request` is mocked so the tests don't need a live DB
or a valid API key mirroring the pattern in
``test_mcp_save_workflow.py``.
"""
"""Unit tests for the MCP docs discovery tools."""
from __future__ import annotations
@ -17,71 +7,152 @@ from pathlib import Path
from unittest.mock import AsyncMock, patch
import pytest
from fastapi import HTTPException
from api.mcp_server.tools import docs_search as docs_search_module
from api.mcp_server.tools.docs_search import (
_docs_url_for,
_docs_index,
_extract_page_title,
_resolve_docs_root,
_score_page,
_strip_frontmatter,
_tokenize_query,
list_docs,
read_doc,
search_docs,
)
# ─── Fixtures ────────────────────────────────────────────────────────────
def _clear_docs_caches() -> None:
    """Drop the cached docs index so the next call rebuilds it from disk."""
    docs_search_module._docs_index.cache_clear()
@pytest.fixture
def fake_docs_root(tmp_path: Path) -> Path:
"""Build a minimal docs tree on disk and point the tool at it."""
docs_root = tmp_path / "docs"
docs_root.mkdir()
(docs_root / "configurations").mkdir()
(docs_root / "configurations" / "voice.mdx").write_text(
(docs_root / "getting-started").mkdir()
(docs_root / "getting-started" / "index.mdx").write_text(
"---\n"
'title: "Voice"\n'
'title: "Getting started"\n'
'description: "Start using Dograh."\n'
"---\n\n"
"# Voice configuration\n\n"
"Dograh supports ElevenLabs and Cartesia TTS providers.\n"
"Configure the ElevenLabs voice_id in your workspace settings.\n",
"# Getting started\n\n"
"Welcome to Dograh.\n",
encoding="utf-8",
)
(docs_root / "configurations" / "transcriber.mdx").write_text(
(docs_root / "voice-agent").mkdir()
(docs_root / "voice-agent" / "introduction.mdx").write_text(
"---\n"
'title: "Transcriber"\n'
'title: "Voice Agent Builder"\n'
'description: "Build conversational workflows."\n'
"---\n\n"
"# Speech-to-text\n\nDeepgram is the default transcriber.\n",
"# Voice Agent Builder\n\n"
"Build workflows with nodes and tools.\n",
encoding="utf-8",
)
(docs_root / "voice-agent" / "tools").mkdir()
(docs_root / "voice-agent" / "tools" / "mcp-tool.mdx").write_text(
"---\n"
'title: "MCP Tool"\n'
'description: "Connect external MCP servers."\n'
'llm_hint: "Use for MCP server setup, remote tools, or model context protocol questions."\n'
"aliases:\n"
' - "model context protocol"\n'
"---\n\n"
"# MCP Tool\n\n"
"Connect an external MCP server to your voice agent.\n\n"
"## Authentication\n\n"
"Provide the MCP endpoint URL and headers.\n",
encoding="utf-8",
)
(docs_root / "deployment").mkdir()
(docs_root / "deployment" / "turn-server.mdx").write_text(
(docs_root / "deployment" / "docker.mdx").write_text(
"---\n"
'title: "TURN server setup"\n'
'title: "Docker"\n'
'description: "Deploy Dograh with Docker."\n'
'llm_hint: "Use for Docker deployment, local setup, remote setup, TURN server, coturn, or WebRTC connectivity questions."\n'
"aliases:\n"
' - "coturn"\n'
' - "turn server"\n'
"---\n\n"
"# TURN server\n\n"
"WebRTC requires a TURN server for NAT traversal. Coturn is the "
"recommended choice for self-hosted deployments.\n",
"# Docker\n\n"
"Run Dograh with Docker.\n\n"
"## Troubleshooting WebRTC Connectivity\n\n"
"If audio fails or ICE fails, configure a TURN server. Coturn is the recommended choice.\n",
encoding="utf-8",
)
# A non-doc file that must be ignored by the corpus loader.
(docs_root / "docs.json").write_text('{"name":"Dograh"}', encoding="utf-8")
# Hidden/orphaned docs page: present on disk but not in docs.json, so it
# must not be indexed by the MCP tools.
(docs_root / "internal-only.mdx").write_text(
"---\n"
'title: "Internal TURN Notes"\n'
"---\n\n"
"# Internal TURN Notes\n\n"
"This page mentions zyxinternalturntoken but is not user-facing.\n",
encoding="utf-8",
)
# Reset the LRU cache and pin the resolver to our tmp tree.
docs_search_module._docs_corpus.cache_clear()
(docs_root / "AGENTS.md").write_text("# Internal instructions\n", encoding="utf-8")
(docs_root / "docs.json").write_text(
"""{
"navigation": {
"tabs": [
{
"tab": "Guides",
"groups": [
{
"group": "Getting started",
"pages": [
"getting-started/index"
]
},
{
"group": "Voice Agent Builder",
"pages": [
"voice-agent/introduction",
{
"group": "Tools",
"pages": [
"voice-agent/tools/mcp-tool"
]
}
]
}
]
},
{
"tab": "Developer",
"groups": [
{
"group": "Deployment",
"pages": [
"deployment/docker"
]
}
]
}
]
}
}
""",
encoding="utf-8",
)
_clear_docs_caches()
with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(docs_root)}):
yield docs_root
docs_search_module._docs_corpus.cache_clear()
_clear_docs_caches()
@pytest.fixture
def authed_user():
"""Stub ``authenticate_mcp_request`` so tests skip the API-key path."""
class _FakeUser:
selected_organization_id = 1
id = 42
@ -93,18 +164,8 @@ def authed_user():
yield _FakeUser()
# ─── Pure helpers ────────────────────────────────────────────────────────
def test_tokenize_query_strips_short_and_punct_terms():
"""Punctuation and 1-char tokens must not bleed into the scorer.
A trailing `?` or stray `a` would otherwise match nearly every page
and flatten the relevance ranking.
"""
assert _tokenize_query("How do I configure a TURN server?") == [
"how",
"do",
def test_tokenize_query_dedupes_and_drops_stopwords():
assert _tokenize_query("How do I configure a TURN server TURN?") == [
"configure",
"turn",
"server",
@ -121,155 +182,92 @@ def test_strip_frontmatter_removes_yaml_block():
assert _strip_frontmatter(body).startswith("# Heading")
def test_strip_frontmatter_passes_through_when_missing():
    """Text with no frontmatter block must come back unchanged."""
    unfenced = "# Just a heading\nbody text\n"
    stripped = _strip_frontmatter(unfenced)
    assert stripped == unfenced
def test_extract_page_title_prefers_frontmatter():
    """The frontmatter ``title`` wins over the first heading."""
    source = '---\ntitle: "Front Title"\n---\n\n# Heading Title\n'
    resolved = _extract_page_title(source, fallback="x.mdx")
    assert resolved == "Front Title"
def test_extract_page_title_falls_back_to_first_heading():
    """Without frontmatter, the first ATX heading supplies the title.

    That is the next best signal — better than returning the filename,
    which is often a slug rather than a human-readable title.
    """
    source = "# Heading Title\nbody\n"
    resolved = _extract_page_title(source, fallback="x.mdx")
    assert resolved == "Heading Title"
def test_extract_page_title_falls_back_to_filename_when_nothing_matches():
    """With neither frontmatter nor a heading, the fallback is returned."""
    source = "plain prose with no heading or frontmatter"
    resolved = _extract_page_title(source, fallback="x.mdx")
    assert resolved == "x.mdx"
def test_docs_url_for_strips_extension_and_index():
    """Docs URLs omit the ``.mdx`` suffix and a trailing ``index`` segment."""
    cases = (
        ("configurations/voice.mdx", "https://docs.dograh.com/configurations/voice"),
        ("getting-started/index.mdx", "https://docs.dograh.com/getting-started"),
    )
    for rel_path, expected_url in cases:
        assert _docs_url_for(rel_path) == expected_url
def test_score_page_weights_title_above_body():
    """A single title hit must outscore repeated body hits.

    Otherwise a long page that incidentally mentions the term many times
    outranks the page whose purpose IS the term.
    """
    score_from_title = _score_page(
        rel_path="other.mdx",
        title="TURN server",
        body="unrelated text",
        terms=["turn"],
    )
    score_from_body = _score_page(
        rel_path="other.mdx",
        title="Unrelated",
        body="turn turn turn turn turn",
        terms=["turn"],
    )
    assert score_from_title > score_from_body
def test_score_page_returns_zero_when_no_terms_match():
assert (
_score_page(
rel_path="x.mdx", title="X", body="hello world", terms=["nonexistent"]
)
== 0
def test_score_page_uses_llm_hint_and_aliases():
    """A ``coturn`` query scores above zero and reports the WebRTC section."""
    webrtc_section = docs_search_module.DocSection(
        title="Troubleshooting WebRTC Connectivity",
        slug="troubleshooting-webrtc-connectivity",
        level=2,
        content="Configure a TURN server with coturn.",
    )
    docker_page = docs_search_module.DocPage(
        path="deployment/docker",
        file_path="deployment/docker.mdx",
        title="Docker",
        description="Deploy Dograh with Docker.",
        llm_hint="Use for TURN server and coturn setup.",
        aliases=("coturn",),
        breadcrumb=("Developer", "Deployment"),
        content="Docker deployment.",
        sections=(webrtc_section,),
        order=0,
    )

    page_score, matched_section = _score_page(docker_page, ["coturn"])

    assert page_score > 0
    assert matched_section is not None
    assert matched_section.slug == "troubleshooting-webrtc-connectivity"
def test_resolve_docs_root_honors_env_override(tmp_path: Path):
    """``DOGRAH_DOCS_PATH`` pointing at an existing docs dir is returned."""
    override_dir = tmp_path / "custom_docs"
    override_dir.mkdir()
    manifest = override_dir / "docs.json"
    manifest.write_text("{}", encoding="utf-8")

    env_override = {"DOGRAH_DOCS_PATH": str(override_dir)}
    with patch.dict(os.environ, env_override):
        assert _resolve_docs_root() == override_dir.resolve()
def test_resolve_docs_root_ignores_nonexistent_env_value(tmp_path: Path):
    """A bogus env value must not crash the tool.

    The resolver should fall back to discovery (the real ``docs/`` in the
    repo) instead.
    """
    missing_dir = tmp_path / "nope"
    with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(missing_dir)}):
        # Walk-up discovery should land somewhere (the repo's actual docs),
        # but the exact path depends on where the tests are run. So only
        # assert that nothing raised and the result is None or a directory.
        docs_root = _resolve_docs_root()
        assert docs_root is None or docs_root.is_dir()
# ─── End-to-end tool behaviour ───────────────────────────────────────────
@pytest.mark.asyncio
async def test_search_docs_ranks_turn_setup_first_for_turn_query(
async def test_search_docs_ranks_turn_doc_and_uses_route_path(
fake_docs_root, authed_user
):
"""The page whose title and body are both about TURN must outrank
incidental mentions of related words on other pages."""
results = await search_docs("How do I set up a TURN server?")
assert results, "expected at least one result"
assert results[0]["path"] == "deployment/turn-server.mdx"
assert results[0]["url"] == "https://docs.dograh.com/deployment/turn-server"
assert "TURN server" in results[0]["title"]
assert "TURN" in results[0]["snippet"] or "turn" in results[0]["snippet"].lower()
results = await search_docs("How do I configure coturn for WebRTC?")
assert results
assert results[0]["path"] == "deployment/docker"
assert results[0]["section_slug"] == "troubleshooting-webrtc-connectivity"
assert "TURN server" in results[0]["llm_hint"]
assert "snippet" not in results[0]
assert "score" not in results[0]
assert "url" not in results[0]
@pytest.mark.asyncio
async def test_search_docs_excludes_non_doc_files(fake_docs_root, authed_user):
    """``docs.json`` must never be returned as a search hit.

    The corpus loader filters to .mdx/.md only.
    """
    hits = await search_docs("Dograh")
    hit_paths = [hit["path"] for hit in hits]
    assert "docs.json" not in hit_paths
@pytest.mark.asyncio
async def test_search_docs_returns_empty_when_no_match(fake_docs_root, authed_user):
results = await search_docs("xyzzy unrelated zzz")
async def test_search_docs_indexes_only_docs_json_pages(fake_docs_root, authed_user):
results = await search_docs("zyxinternalturntoken")
assert results == []
@pytest.mark.asyncio
async def test_search_docs_respects_limit(fake_docs_root, authed_user):
"""``limit=1`` must collapse the result list even if multiple pages
match."""
results = await search_docs("Dograh", limit=1)
results = await search_docs("dograh", limit=1)
assert len(results) == 1
@pytest.mark.asyncio
async def test_search_docs_clamps_limit_to_hard_cap(fake_docs_root, authed_user):
"""A pathological large limit must be clamped to
``DOCS_SEARCH_MAX_LIMIT`` (=25) so the payload stays bounded."""
# Drop in extra docs so there's headroom to verify the clamp.
for i in range(30):
(fake_docs_root / f"extra-{i}.mdx").write_text(
f"# Page {i}\nThis Dograh page covers configurations topic {i}.\n",
encoding="utf-8",
)
docs_search_module._docs_corpus.cache_clear()
results = await search_docs("Dograh", limit=999)
assert len(results) <= 25
async def test_search_docs_returns_empty_when_no_match(fake_docs_root, authed_user):
assert await search_docs("xyzzy unrelated zzz") == []
@pytest.mark.asyncio
async def test_search_docs_returns_empty_when_no_corpus(
tmp_path, authed_user, monkeypatch
):
"""If the docs directory doesn't exist on disk, the tool must
degrade to an empty list rather than raising — Docker images and
dev checkouts can disagree on layout."""
nonexistent = tmp_path / "no-docs-here"
monkeypatch.setenv("DOGRAH_DOCS_PATH", str(nonexistent))
# Also block the walk-up fallback by pointing the resolver at a
# tmp path with no `docs/` ancestor.
docs_search_module._docs_corpus.cache_clear()
_clear_docs_caches()
with patch(
"api.mcp_server.tools.docs_search._resolve_docs_root", return_value=None
):
results = await search_docs("anything")
assert results == []
assert await search_docs("anything") == []
@pytest.mark.asyncio
@ -279,16 +277,83 @@ async def test_search_docs_rejects_empty_query(fake_docs_root, authed_user):
@pytest.mark.asyncio
async def test_search_docs_rejects_query_with_no_real_terms(
async def test_search_docs_rejects_query_with_only_stopwords(
fake_docs_root, authed_user
):
"""A query like `"???"` tokenizes to nothing — surface an actionable
error rather than silently returning every page."""
with pytest.raises(ValueError, match="2\\+ alphanumeric"):
await search_docs("?? // !!")
with pytest.raises(ValueError, match="non-stopword"):
await search_docs("how do I")
@pytest.mark.asyncio
async def test_search_docs_rejects_zero_limit(fake_docs_root, authed_user):
    """A ``limit`` of zero is rejected with a ``ValueError``."""
    expect_rejection = pytest.raises(ValueError, match="at least 1")
    with expect_rejection:
        await search_docs("Dograh", limit=0)
@pytest.mark.asyncio
async def test_list_docs_returns_top_level_sections(fake_docs_root, authed_user):
    """A bare ``list_docs()`` call leads with the first two section entries."""
    listing = await list_docs()

    leading_entry = listing[0]
    assert leading_entry["kind"] == "section"
    assert leading_entry["path"] == "guides/getting-started"

    following_entry = listing[1]
    assert following_entry["path"] == "guides/voice-agent-builder"
@pytest.mark.asyncio
async def test_list_docs_depth_expands_children(fake_docs_root, authed_user):
    """``depth=2`` returns direct pages, the nested section, and its pages."""
    listing = await list_docs("guides/voice-agent-builder", depth=2)
    listed_paths = [entry["path"] for entry in listing]

    expected_paths = (
        "voice-agent/introduction",
        "guides/voice-agent-builder/tools",
        "voice-agent/tools/mcp-tool",
    )
    for expected in expected_paths:
        assert expected in listed_paths
@pytest.mark.asyncio
async def test_list_docs_rejects_unknown_section(fake_docs_root, authed_user):
    """An unrecognised section path raises ``HTTPException``."""
    expect_rejection = pytest.raises(HTTPException, match="Unknown docs section")
    with expect_rejection:
        await list_docs("nope")
@pytest.mark.asyncio
async def test_read_doc_returns_full_page_and_sections(fake_docs_root, authed_user):
    """Reading a page yields its metadata, section list, and body text."""
    doc = await read_doc("deployment/docker")

    assert doc["path"] == "deployment/docker"
    assert doc["title"] == "Docker"
    assert "url" not in doc

    slugs = [entry["slug"] for entry in doc["sections"]]
    assert "docker" in slugs
    assert "troubleshooting-webrtc-connectivity" in slugs

    body = doc["content"]
    assert "Coturn" in body or "coturn" in body.lower()
@pytest.mark.asyncio
async def test_read_doc_can_target_section(fake_docs_root, authed_user):
    """Passing ``section`` narrows the returned content to that section."""
    target_slug = "troubleshooting-webrtc-connectivity"
    doc = await read_doc("deployment/docker", section=target_slug)

    assert doc["section_slug"] == "troubleshooting-webrtc-connectivity"

    body = doc["content"]
    assert "ICE fails" in body or "TURN server" in body
    # Text from the page intro must not leak into the section payload.
    assert "Run Dograh with Docker." not in doc["content"]
@pytest.mark.asyncio
async def test_read_doc_rejects_unknown_page(fake_docs_root, authed_user):
    """An unrecognised page path raises ``HTTPException``."""
    expect_rejection = pytest.raises(HTTPException, match="Unknown docs page")
    with expect_rejection:
        await read_doc("missing/page")
@pytest.mark.asyncio
async def test_read_doc_rejects_unknown_section(fake_docs_root, authed_user):
    """An unrecognised section slug on a known page raises ``HTTPException``."""
    expect_rejection = pytest.raises(HTTPException, match="Unknown section")
    with expect_rejection:
        await read_doc("deployment/docker", section="missing-section")
def test_docs_index_uses_docs_json_navigation(fake_docs_root):
    """The index omits the unlisted page and records nested groups and breadcrumbs."""
    docs_index = _docs_index()

    assert "internal-only" not in docs_index.pages_by_path
    assert "guides/voice-agent-builder/tools" in docs_index.sections_by_path

    mcp_page = docs_index.pages_by_path["voice-agent/tools/mcp-tool"]
    expected_trail = ("Guides", "Voice Agent Builder", "Tools")
    assert mcp_page.breadcrumb == expected_trail