feat: improve docs search

This commit is contained in:
Abhishek Kumar 2026-05-20 18:18:05 +05:30
parent 4618af20b8
commit 5c638070e0
5 changed files with 876 additions and 404 deletions

View file

@ -16,6 +16,11 @@ You build and edit Dograh voice-AI workflows by emitting TypeScript that uses th
## Call order
### Reading documentation
1. `search_docs(query)` — use first for keyword or acronym lookup when the user is asking how Dograh works or how to configure something.
2. `read_doc(path)` — fetch the full page once one result looks likely. Prefer this over reasoning from search summaries alone.
3. `list_docs(path=None, depth=1)` — use when the user wants to browse a topic area or when search terms are too vague. Returned section paths feed back into `list_docs`; returned page paths feed into `read_doc`.
### Editing an existing workflow
1. `list_workflows` — locate the target workflow.
2. `get_workflow_code(workflow_id)` — fetch the current source.

View file

@ -1,4 +1,5 @@
from fastmcp import FastMCP
from mcp.types import ToolAnnotations
from api.mcp_server.instructions import DOGRAH_MCP_INSTRUCTIONS
from api.mcp_server.tools.catalog import (
@ -8,7 +9,7 @@ from api.mcp_server.tools.catalog import (
list_tools,
)
from api.mcp_server.tools.create_workflow import create_workflow
from api.mcp_server.tools.docs_search import search_docs
from api.mcp_server.tools.docs_search import list_docs, read_doc, search_docs
from api.mcp_server.tools.get_workflow_code import get_workflow_code
from api.mcp_server.tools.node_types import get_node_type, list_node_types
from api.mcp_server.tools.save_workflow import save_workflow
@ -28,6 +29,15 @@ for _tool in (
list_tools,
list_workflows,
save_workflow,
search_docs,
):
mcp.tool(_tool)
# Annotations shared by the docs tools: each is flagged read-only,
# idempotent, non-destructive, and not open-world.
_DOCS_TOOL_ANNOTATIONS = ToolAnnotations(
    readOnlyHint=True,
    idempotentHint=True,
    destructiveHint=False,
    openWorldHint=False,
)
# Register the three docs tools with the shared annotations attached.
for _tool in (list_docs, read_doc, search_docs):
    mcp.tool(_tool, annotations=_DOCS_TOOL_ANNOTATIONS)

View file

@ -1,312 +1,704 @@
"""`search_docs` MCP tool — keyword search over the Mintlify docs tree.
"""MCP docs discovery tools over the Mintlify docs tree.
The docs are shipped into the API image (`COPY ./docs ./docs` in
`api/Dockerfile`), so this tool works for both source/dev runs and
Docker deployments. For source/dev runs we walk up from this file to
locate the `docs/` directory; for Docker we land on `/app/docs`. An
explicit `DOGRAH_DOCS_PATH` env var overrides discovery.
The docs surface is intentionally split into three steps:
The implementation is intentionally dependency-free: it does in-memory
keyword scoring rather than building a vector index. The docs corpus is
small (~100 .mdx files, ~140k LoC), so a per-call scan is well under
50 ms and avoids needing an embedding backend, vector store, or
background indexer for a tool that's called interactively from MCP.
- ``list_docs`` for lightweight navigation over the published hierarchy
- ``search_docs`` for keyword lookup across the visible docs catalog
- ``read_doc`` for the full content of one chosen page (or one section)
The runtime index is derived from ``docs/docs.json`` plus the referenced
``.mdx``/``.md`` files. That keeps navigation, ordering, and visibility in
sync with the published docs rather than indexing every file under ``docs/``.
"""
from __future__ import annotations
import json
import os
import re
from collections import Counter
from dataclasses import dataclass, replace
from functools import lru_cache
from pathlib import Path
from typing import Any
import yaml
from fastapi import HTTPException
from api.mcp_server.auth import authenticate_mcp_request
from api.mcp_server.tracing import traced_tool
# Public site for the rendered docs. Used to build a clickable URL per
# result; agents can hand the URL back to the user even if the local
# file isn't reachable.
DOCS_SITE_BASE_URL = "https://docs.dograh.com"
# Hard cap regardless of caller-supplied limit. Keeps the MCP response
# payload bounded; Mintlify search APIs use a similar 10-25 ceiling.
DOCS_SEARCH_MAX_LIMIT = 25
DOCS_LIST_MAX_DEPTH = 3
_ROOT_SECTION_PATH = "__root__"
# Heading-detection regex. Matches ATX headings (`# `, `## `, etc.) but
# not in-line `#` characters.
_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
_FRONTMATTER_RE = re.compile(r"\A---\s*\n(.*?)\n---\s*\n?", re.DOTALL)
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.*?)\s*$", re.MULTILINE)
_STOPWORDS = {
"a",
"an",
"and",
"are",
"at",
"be",
"by",
"can",
"do",
"for",
"from",
"how",
"i",
"if",
"in",
"is",
"it",
"me",
"my",
"of",
"on",
"or",
"the",
"to",
"what",
"when",
"where",
"with",
"you",
"your",
}
@dataclass(frozen=True)
class DocSection:
    """One addressable slice of a docs page body (built by ``_extract_sections``)."""

    title: str  # cleaned heading text, or "Overview" for text before the first heading
    slug: str  # slugified title; matched by ``_find_section``
    level: int  # number of leading ``#`` characters (1 for the synthetic overview)
    content: str  # stripped text of the section, heading line included
@dataclass(frozen=True)
class DocPage:
    """Immutable record for one docs page plus its parsed sections."""

    path: str
    file_path: str
    title: str
    description: str
    llm_hint: str
    aliases: tuple[str, ...]
    breadcrumb: tuple[str, ...]
    content: str
    sections: tuple[DocSection, ...]
    order: int

    def breadcrumb_text(self) -> str:
        """Render the breadcrumb trail as ``A > B > C``."""
        separator = " > "
        return separator.join(self.breadcrumb)

    def routing_hint(self) -> str:
        """Return ``llm_hint`` when it is set, otherwise the description."""
        if self.llm_hint:
            return self.llm_hint
        return self.description

    def to_catalog_dict(self, section: DocSection | None = None) -> dict:
        """Build the lean listing/search entry, optionally tagged with a section."""
        entry = {
            "kind": "page",
            "path": self.path,
            "title": self.title,
            "breadcrumb": self.breadcrumb_text(),
            "llm_hint": self.routing_hint(),
        }
        if section is None:
            return _compact_dict(entry)
        entry["section_title"] = section.title
        entry["section_slug"] = section.slug
        return _compact_dict(entry)

    def to_read_dict(self, section: DocSection | None = None) -> dict:
        """Build the full read payload for the page, or for one section of it."""
        # With a section selected, only that section's text is returned as
        # ``content``; the section outline always covers the whole page.
        body = self.content if section is None else section.content
        outline = [
            {"title": entry.title, "slug": entry.slug}
            for entry in self.sections
            if entry.title and entry.slug
        ]
        payload = {
            "path": self.path,
            "title": self.title,
            "breadcrumb": self.breadcrumb_text(),
            "llm_hint": self.routing_hint(),
            "section_title": None if section is None else section.title,
            "section_slug": None if section is None else section.slug,
            "content": body,
            "sections": outline,
        }
        return _compact_dict(payload)
@dataclass(frozen=True)
class NavSection:
    """Immutable navigation node: a group of child pages and/or sections."""

    path: str
    title: str
    breadcrumb: tuple[str, ...]
    # Each child is a ``(kind, path)`` pair where kind is "page" or "section".
    children: tuple[tuple[str, str], ...]
    descendant_page_count: int = 0

    def breadcrumb_text(self) -> str:
        """Render the breadcrumb trail as ``A > B > C``."""
        separator = " > "
        return separator.join(self.breadcrumb)

    def to_mcp_dict(self) -> dict:
        """Build the listing entry for this section."""
        total_pages = self.descendant_page_count
        # A browse hint is only offered when there is something to browse.
        browse_hint = (
            f"Browse {total_pages} docs in this section." if total_pages else None
        )
        payload = {
            "kind": "section",
            "path": self.path,
            "title": self.title,
            "breadcrumb": self.breadcrumb_text(),
            "llm_hint": browse_hint,
            "has_children": bool(self.children),
            "child_count": len(self.children),
            "page_count": total_pages,
        }
        return _compact_dict(payload)
@dataclass(frozen=True)
class DocsIndex:
    """Lookup tables produced by ``_docs_index``."""

    pages_by_path: dict[str, DocPage]  # keyed by docs route path
    sections_by_path: dict[str, NavSection]  # keyed by navigation section path
def _compact_dict(data: dict[str, Any]) -> dict[str, Any]:
return {
key: value for key, value in data.items() if value not in (None, "", [], (), {})
}
def _slugify(value: str) -> str:
slug = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")
return slug or "section"
def _coerce_docs_root(candidate: Path) -> Path | None:
candidate = candidate.expanduser().resolve()
if (candidate / "docs.json").is_file():
return candidate
nested = candidate / "docs"
if (nested / "docs.json").is_file():
return nested
return None
def _resolve_docs_root() -> Path | None:
"""Return the path to the on-disk docs tree, or None if not found.
Resolution order:
1. ``DOGRAH_DOCS_PATH`` env var (absolute path).
2. ``/app/docs`` the location the API Dockerfile copies docs to.
3. Walk upward from this file looking for a sibling ``docs/`` dir
(covers source-checkout / dev runs).
"""
"""Return the path to the on-disk docs tree, or None if not found."""
override = os.environ.get("DOGRAH_DOCS_PATH")
if override:
candidate = Path(override).expanduser().resolve()
if candidate.is_dir():
return candidate
resolved = _coerce_docs_root(Path(override))
if resolved is not None:
return resolved
docker_default = Path("/app/docs")
if docker_default.is_dir():
docker_default = _coerce_docs_root(Path("/app/docs"))
if docker_default is not None:
return docker_default
# Walk up from .../api/mcp_server/tools/docs_search.py looking for docs/.
for parent in Path(__file__).resolve().parents:
candidate = parent / "docs"
if candidate.is_dir():
return candidate
resolved = _coerce_docs_root(parent / "docs")
if resolved is not None:
return resolved
return None
@lru_cache(maxsize=1)
def _docs_corpus() -> tuple[tuple[str, str], ...]:
    """Read every ``.mdx``/``.md`` file under the docs root, once per process.

    Returns ``(relative_posix_path, text)`` pairs in sorted path order, or
    an empty tuple when no docs root resolves. The result is memoized, so
    on-disk edits are only picked up after a process restart.
    """
    docs_root = _resolve_docs_root()
    if docs_root is None:
        return ()

    allowed_suffixes = {".mdx", ".md"}
    corpus: list[tuple[str, str]] = []
    for entry in sorted(docs_root.rglob("*")):
        if not (entry.is_file() and entry.suffix.lower() in allowed_suffixes):
            continue
        try:
            text = entry.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # An unreadable file is skipped so one bad page cannot take
            # down the whole tool.
            continue
        corpus.append((entry.relative_to(docs_root).as_posix(), text))
    return tuple(corpus)
def _tokenize_query(query: str) -> list[str]:
"""Split a user query into lowercased keyword terms.
Empty strings and 1-char filler terms are dropped they would
match almost every file and drown out the real signal.
"""
terms = re.findall(r"[A-Za-z0-9_]+", query.lower())
return [term for term in terms if len(term) >= 2]
def _extract_page_title(contents: str, fallback: str) -> str:
    """Choose a display title for a docs page.

    Preference order: a non-empty ``title:`` line inside a leading ``---``
    frontmatter block, then the first ATX heading, then ``fallback``.
    """
    if contents.startswith("---"):
        closing = contents.find("---", 3)
        if closing != -1:
            for raw_line in contents[3:closing].splitlines():
                entry = raw_line.strip()
                if not entry.lower().startswith("title:"):
                    continue
                candidate = entry.split(":", 1)[1].strip()
                # Drop one pair of matching surrounding quotes, if present.
                is_quoted = (
                    len(candidate) >= 2
                    and candidate[0] == candidate[-1]
                    and candidate[0] in ('"', "'")
                )
                if is_quoted:
                    candidate = candidate[1:-1]
                if candidate:
                    return candidate

    heading = _HEADING_RE.search(contents)
    if heading:
        return heading.group(2).strip()
    return fallback
def _split_frontmatter(contents: str) -> tuple[dict[str, Any], str]:
    """Separate a leading YAML frontmatter block from the page body.

    Returns ``(frontmatter, body)``. When there is no frontmatter block, or
    its YAML fails to parse, the result is ``({}, contents)`` with the text
    untouched. Frontmatter that parses to something other than a mapping is
    treated as empty, but the block is still removed from the body.
    """
    found = _FRONTMATTER_RE.match(contents)
    if found is None:
        return {}, contents

    try:
        parsed = yaml.safe_load(found.group(1))
    except yaml.YAMLError:
        return {}, contents

    metadata = parsed if isinstance(parsed, dict) else {}
    remainder = contents[found.end() :].lstrip("\n")
    return metadata, remainder
def _strip_frontmatter(contents: str) -> str:
"""Drop the YAML frontmatter block from a docs page body."""
if not contents.startswith("---"):
return contents
end = contents.find("---", 3)
if end == -1:
return contents
return contents[end + 3 :].lstrip("\n")
return _split_frontmatter(contents)[1]
def _build_snippet(body: str, terms: list[str], snippet_radius: int = 120) -> str:
"""Return a ~240-char window around the first term hit in ``body``.
def _clean_heading_text(raw: str) -> str:
text = re.sub(r"\s*\{#.*\}\s*$", "", raw.strip())
return " ".join(text.split())
The window is centered on the earliest match (whichever term comes
first wins) so the snippet shows context for the strongest signal,
not the lexicographically-first term. Leading/trailing newlines are
collapsed so the snippet renders cleanly through MCP's text payload.
"""
body_lower = body.lower()
earliest = -1
for term in terms:
idx = body_lower.find(term)
if idx != -1 and (earliest == -1 or idx < earliest):
earliest = idx
if earliest == -1:
# No hit in body — the match must have come from the title or
# path, so just return the first line of body as orientation.
first_line = next(
(line.strip() for line in body.splitlines() if line.strip()),
"",
def _extract_page_title(contents: str, fallback: str) -> str:
    """Choose a display title for a docs page.

    Preference order: a non-blank string ``title`` in the frontmatter, then
    the first ATX heading of the body (cleaned), then ``fallback``.
    """
    metadata, remainder = _split_frontmatter(contents)
    declared = metadata.get("title")
    if isinstance(declared, str):
        declared = declared.strip()
        if declared:
            return declared

    heading = _HEADING_RE.search(remainder)
    if heading is None:
        return fallback
    return _clean_heading_text(heading.group(2))
def _normalize_text(value: Any) -> str:
if isinstance(value, str):
return " ".join(value.strip().split())
return ""
def _normalize_aliases(value: Any) -> tuple[str, ...]:
if isinstance(value, str):
aliases = [value]
elif isinstance(value, list):
aliases = [item for item in value if isinstance(item, str)]
else:
aliases = []
return tuple(alias.strip() for alias in aliases if alias.strip())
def _extract_sections(body: str) -> tuple[DocSection, ...]:
matches = list(_HEADING_RE.finditer(body))
stripped_body = body.strip()
if not matches:
if not stripped_body:
return ()
return (
DocSection(
title="Overview",
slug="overview",
level=1,
content=stripped_body,
),
)
return first_line[: snippet_radius * 2]
start = max(0, earliest - snippet_radius)
end = min(len(body), earliest + snippet_radius)
snippet = body[start:end]
# Collapse all whitespace runs (incl. internal newlines) for a
# single-line snippet — MCP renders text payloads inline.
snippet = " ".join(snippet.split())
prefix = "" if start > 0 else ""
suffix = "" if end < len(body) else ""
return f"{prefix}{snippet}{suffix}"
sections: list[DocSection] = []
preamble = body[: matches[0].start()].strip()
if preamble:
sections.append(
DocSection(
title="Overview",
slug="overview",
level=1,
content=preamble,
)
)
for index, match in enumerate(matches):
start = match.start()
end = matches[index + 1].start() if index + 1 < len(matches) else len(body)
title = _clean_heading_text(match.group(2))
sections.append(
DocSection(
title=title or "Section",
slug=_slugify(title or "section"),
level=len(match.group(1)),
content=body[start:end].strip(),
)
)
return tuple(sections)
def _score_page(
rel_path: str,
title: str,
body: str,
terms: list[str],
) -> int:
"""Weighted keyword score for a single docs page.
def _tokenize_text(text: str) -> list[str]:
    """Lowercase ``text`` and return its tokens, in order, duplicates kept.

    Tokens shorter than two characters and tokens listed in ``_STOPWORDS``
    are left out.
    """
    kept: list[str] = []
    for word in _TOKEN_RE.findall(text.lower()):
        if len(word) < 2 or word in _STOPWORDS:
            continue
        kept.append(word)
    return kept
Title/path matches outweigh body matches because they encode the
page's purpose, not just incidental mentions. Each query term
contributes independently a page matching all terms ranks above
one matching a single term many times.
"""
if not terms:
return 0
score = 0
path_lower = rel_path.lower()
title_lower = title.lower()
body_lower = body.lower()
for term in terms:
path_hits = path_lower.count(term)
title_hits = title_lower.count(term)
body_hits = body_lower.count(term)
if path_hits == 0 and title_hits == 0 and body_hits == 0:
# Penalize pages that miss any query term — they probably
# aren't what the caller wants.
def _tokenize_query(query: str) -> list[str]:
"""Split a user query into lowercased keyword terms."""
seen: set[str] = set()
terms: list[str] = []
for token in _TOKEN_RE.findall(query.lower()):
if len(token) < 2 or token in _STOPWORDS or token in seen:
continue
# Diminishing returns past a few hits per term: 1 dominant page
# shouldn't outweigh a page that hits every term. The cap is
# deliberately set so ``title_weight (5)`` strictly exceeds
# ``body_cap (4) × body_weight (1)`` — a page whose TITLE is the
# term must outrank a page that merely mentions it repeatedly.
body_hits = min(body_hits, 4)
score += path_hits * 8 + title_hits * 5 + body_hits
seen.add(token)
terms.append(token)
return terms
def _resolve_doc_file(root: Path, route_path: str) -> Path | None:
candidates = (
root / f"{route_path}.mdx",
root / f"{route_path}.md",
root / route_path / "index.mdx",
root / route_path / "index.md",
)
for candidate in candidates:
if candidate.is_file():
return candidate
return None
def _build_doc_page(
    root: Path,
    route_path: str,
    *,
    breadcrumb: tuple[str, ...],
    order: int,
) -> DocPage | None:
    """Load and parse the docs page behind ``route_path``.

    Returns ``None`` when no backing file resolves under ``root`` or the
    file cannot be read as UTF-8.
    """
    source_file = _resolve_doc_file(root, route_path)
    if source_file is None:
        return None
    try:
        raw_text = source_file.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None

    metadata, remainder = _split_frontmatter(raw_text)
    # The last route segment, de-hyphenated and title-cased, stands in when
    # the page declares no title of its own.
    last_segment = route_path.rsplit("/", 1)[-1]
    default_title = last_segment.replace("-", " ").title()
    page_body = remainder.strip()

    return DocPage(
        path=route_path,
        file_path=source_file.relative_to(root).as_posix(),
        title=_extract_page_title(raw_text, fallback=default_title),
        description=_normalize_text(metadata.get("description")),
        llm_hint=_normalize_text(metadata.get("llm_hint")),
        aliases=_normalize_aliases(metadata.get("aliases")),
        breadcrumb=breadcrumb,
        content=page_body,
        sections=_extract_sections(page_body),
        order=order,
    )
def _score_counter(counter: Counter[str], term: str, *, weight: int, cap: int) -> int:
return min(counter.get(term, 0), cap) * weight
def _normalized_phrase(text: str) -> str:
    """Return the tokens of ``text`` (per ``_tokenize_text``) joined by single spaces."""
    tokens = _tokenize_text(text)
    return " ".join(tokens)
def _score_section(section: DocSection, terms: list[str]) -> int:
    """Score how well one section matches the query terms.

    Per term: title hits weigh 7 (capped at 2 hits) and body hits weigh 1
    (capped at 4 hits). Each distinct term that matched adds 4, and the
    full query phrase appearing in the body as whole tokens adds 6.

    Args:
        section: Section to score.
        terms: Query terms as produced by ``_tokenize_query``.

    Returns:
        A non-negative integer; 0 means nothing matched.
    """
    title_counts = Counter(_tokenize_text(section.title))
    body_tokens = _tokenize_text(section.content)
    body_counts = Counter(body_tokens)
    score = 0
    matched_terms = 0
    for term in terms:
        term_score = _score_counter(
            title_counts, term, weight=7, cap=2
        ) + _score_counter(body_counts, term, weight=1, cap=4)
        if term_score:
            matched_terms += 1
            score += term_score
    score += matched_terms * 4

    phrase = " ".join(terms)
    body_phrase = " ".join(body_tokens)
    # Pad both sides with spaces so the phrase only matches whole tokens.
    # A bare substring test lets "turn" match inside "return", which gave a
    # bonus (and a non-zero score) to sections with no real match.
    if phrase and f" {phrase} " in f" {body_phrase} ":
        score += 6
    return score
def _docs_url_for(rel_path: str) -> str:
    """Map a relative on-disk docs path to its public docs URL.

    The ``.mdx``/``.md`` extension and a trailing ``/index`` segment are
    removed, so ``getting-started/index.mdx`` maps to ``/getting-started``.
    """
    route = re.sub(r"\.(mdx|md)$", "", rel_path, flags=re.IGNORECASE)
    index_suffix = "/index"
    if route.endswith(index_suffix):
        route = route[: -len(index_suffix)]
    url = f"{DOCS_SITE_BASE_URL}/{route}"
    return url.rstrip("/")
def _score_page(page: DocPage, terms: list[str]) -> tuple[int, DocSection | None]:
    """Score a page against the query terms and pick its best section.

    Metadata fields are weighted per term (path 6, title 10, breadcrumb 4,
    routing hint 7, aliases 7, each with a hit cap). Each distinct term that
    matched in metadata adds 8, the best section score is added on top, and
    one phrase bonus is granted for the first of title, routing hint, path
    or best-section content that contains the full query phrase.

    Args:
        page: Page to score.
        terms: Query terms as produced by ``_tokenize_query``.

    Returns:
        ``(score, best_section)``. The score is 0 (with ``None``) when there
        are no terms or nothing matched. ``best_section`` is ``None`` when
        no section scored above 0.
    """
    if not terms:
        return 0, None

    path_counts = Counter(_tokenize_text(page.path))
    title_counts = Counter(_tokenize_text(page.title))
    breadcrumb_counts = Counter(_tokenize_text(" ".join(page.breadcrumb)))
    hint_counts = Counter(_tokenize_text(page.routing_hint()))
    alias_counts = Counter(_tokenize_text(" ".join(page.aliases)))

    score = 0
    matched_terms = 0
    for term in terms:
        term_score = (
            _score_counter(path_counts, term, weight=6, cap=3)
            + _score_counter(title_counts, term, weight=10, cap=2)
            + _score_counter(breadcrumb_counts, term, weight=4, cap=2)
            + _score_counter(hint_counts, term, weight=7, cap=3)
            + _score_counter(alias_counts, term, weight=7, cap=3)
        )
        if term_score:
            matched_terms += 1
            score += term_score

    best_section = None
    best_section_score = 0
    for section in page.sections:
        section_score = _score_section(section, terms)
        if section_score > best_section_score:
            best_section = section
            best_section_score = section_score

    if score == 0 and best_section_score == 0:
        return 0, None

    score += matched_terms * 8 + best_section_score

    phrase = " ".join(terms)
    if phrase:
        padded_phrase = f" {phrase} "

        def has_phrase(text: str) -> bool:
            # Space padding restricts the match to whole tokens, so "turn"
            # no longer matches inside "return".
            return padded_phrase in f" {_normalized_phrase(text)} "

        if has_phrase(page.title):
            score += 12
        elif has_phrase(page.routing_hint()):
            score += 8
        elif has_phrase(page.path):
            score += 8
        elif best_section is not None and has_phrase(best_section.content):
            score += 4
    return score, best_section
def _set_descendant_counts(
sections_by_path: dict[str, NavSection],
section_path: str,
) -> int:
section = sections_by_path[section_path]
page_count = 0
for child_kind, child_path in section.children:
if child_kind == "page":
page_count += 1
else:
page_count += _set_descendant_counts(sections_by_path, child_path)
sections_by_path[section_path] = replace(section, descendant_page_count=page_count)
return page_count
@lru_cache(maxsize=1)
def _docs_index() -> DocsIndex:
    """Build the docs index from ``docs.json``, once per process.

    Navigation is read from ``navigation.tabs[*].groups[*].pages``, where a
    page entry is either a route string or a nested ``{"group", "pages"}``
    object. Pages whose source file cannot be resolved or read are left out.

    Returns:
        The populated index. It is empty when no docs root resolves or
        ``docs.json`` is unreadable or invalid JSON. Valid JSON of an
        unexpected shape (non-object top level, or ``navigation``/``tabs``/
        ``groups`` missing or wrongly typed) is tolerated instead of raising,
        and yields an index holding only an empty root section.
    """
    root = _resolve_docs_root()
    if root is None:
        return DocsIndex(pages_by_path={}, sections_by_path={})

    try:
        docs_config = json.loads((root / "docs.json").read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, json.JSONDecodeError):
        return DocsIndex(pages_by_path={}, sections_by_path={})

    pages_by_path: dict[str, DocPage] = {}
    sections_by_path: dict[str, NavSection] = {}
    page_order = 0

    def ensure_unique_section_path(base_path: str) -> str:
        """Return ``base_path``, or the first free ``base_path-N`` (N >= 2)."""
        if base_path not in sections_by_path:
            return base_path
        suffix = 2
        while f"{base_path}-{suffix}" in sections_by_path:
            suffix += 1
        return f"{base_path}-{suffix}"

    def walk_pages(
        items: list[Any],
        *,
        section_path: str,
        section_title: str,
        ancestor_breadcrumb: tuple[str, ...],
    ) -> None:
        """Register one navigation group and, recursively, its nested groups."""
        nonlocal page_order
        children: list[tuple[str, str]] = []
        page_breadcrumb = ancestor_breadcrumb + (section_title,)

        for item in items:
            if isinstance(item, str):
                route_path = item.strip("/")
                if not route_path:
                    continue
                # A route listed in several groups is built once; the first
                # occurrence decides its breadcrumb and order.
                if route_path not in pages_by_path:
                    page = _build_doc_page(
                        root,
                        route_path,
                        breadcrumb=page_breadcrumb,
                        order=page_order,
                    )
                    if page is not None:
                        pages_by_path[route_path] = page
                        page_order += 1
                if route_path in pages_by_path:
                    children.append(("page", route_path))
                continue

            if not isinstance(item, dict):
                continue

            group_title = str(item.get("group", "")).strip()
            nested_pages = item.get("pages")
            if not group_title or not isinstance(nested_pages, list):
                continue

            child_path = ensure_unique_section_path(
                f"{section_path}/{_slugify(group_title)}"
            )
            walk_pages(
                nested_pages,
                section_path=child_path,
                section_title=group_title,
                ancestor_breadcrumb=page_breadcrumb,
            )
            children.append(("section", child_path))

        sections_by_path[section_path] = NavSection(
            path=section_path,
            title=section_title,
            breadcrumb=ancestor_breadcrumb,
            children=tuple(children),
        )

    # Type-check each navigation layer: docs.json can be valid JSON and
    # still have a null or wrongly-typed ``navigation``, ``tabs`` or ``groups``.
    navigation = docs_config.get("navigation") if isinstance(docs_config, dict) else None
    tabs = navigation.get("tabs") if isinstance(navigation, dict) else None
    if not isinstance(tabs, list):
        tabs = []

    root_children: list[tuple[str, str]] = []
    for tab in tabs:
        if not isinstance(tab, dict):
            continue
        tab_title = str(tab.get("tab", "")).strip() or "Docs"
        groups = tab.get("groups")
        if not isinstance(groups, list):
            continue
        for group in groups:
            if not isinstance(group, dict):
                continue
            group_title = str(group.get("group", "")).strip()
            group_pages = group.get("pages")
            if not group_title or not isinstance(group_pages, list):
                continue
            top_level_path = ensure_unique_section_path(
                f"{_slugify(tab_title)}/{_slugify(group_title)}"
            )
            walk_pages(
                group_pages,
                section_path=top_level_path,
                section_title=group_title,
                ancestor_breadcrumb=(tab_title,),
            )
            root_children.append(("section", top_level_path))

    sections_by_path[_ROOT_SECTION_PATH] = NavSection(
        path=_ROOT_SECTION_PATH,
        title="Docs",
        breadcrumb=(),
        children=tuple(root_children),
    )
    _set_descendant_counts(sections_by_path, _ROOT_SECTION_PATH)
    return DocsIndex(pages_by_path=pages_by_path, sections_by_path=sections_by_path)
def _get_page_or_404(path: str) -> DocPage:
    """Look up an indexed page by route, ignoring leading/trailing slashes.

    Raises:
        HTTPException: 404 when the route is not in the docs index.
    """
    route = path.strip("/")
    found = _docs_index().pages_by_path.get(route)
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail=f"Unknown docs page: {path!r}")
def _find_section(page: DocPage, section: str) -> DocSection | None:
target = section.strip().lower()
for candidate in page.sections:
if candidate.slug.lower() == target or candidate.title.lower() == target:
return candidate
return None
def _expand_nav_entries(
index: DocsIndex,
section_path: str,
depth: int,
) -> list[dict]:
section = index.sections_by_path[section_path]
results: list[dict] = []
for child_kind, child_path in section.children:
if child_kind == "section":
child_section = index.sections_by_path[child_path]
results.append(child_section.to_mcp_dict())
if depth > 1:
results.extend(_expand_nav_entries(index, child_path, depth - 1))
else:
results.append(index.pages_by_path[child_path].to_catalog_dict())
return results
@traced_tool
async def search_docs(query: str, limit: int = 10) -> list[dict]:
"""Search the Dograh documentation by keyword and return ranked pages.
async def list_docs(path: str | None = None, depth: int = 1) -> list[dict]:
"""Browse the Dograh docs hierarchy before reading a page in full.
Use this when the caller asks "how do I configure X" / "where are the docs for Y" /
"what does Dograh say about Z" anything that should land on a docs page
rather than a workspace resource. For workspace data (agents, recordings,
credentials), use ``list_workflows`` / ``list_recordings`` / ``list_credentials``
instead.
Args:
query: Free-form keywords (e.g. "TURN server", "elevenlabs voice").
Tokenized on non-alphanumeric characters; terms shorter than
2 characters are dropped.
limit: Max pages to return. Capped at 25 regardless of input;
default 10 keeps the payload small enough to inline in MCP.
Returns:
Up to ``limit`` results, sorted by descending relevance score.
Each entry has:
* ``path`` repo-relative path (e.g. ``configurations/voice.mdx``)
* ``url`` public docs URL (https://docs.dograh.com/...)
* ``title`` page title (from Mintlify frontmatter when present)
* ``score`` opaque integer relevance score
* ``snippet`` ~240-char excerpt around the first term hit
``path`` addresses navigation sections exposed by this tool. Page paths
returned by ``search_docs`` and ``read_doc`` are the published docs routes
instead, for example ``voice-agent/tools/mcp-tool``.
"""
await authenticate_mcp_request()
if depth < 1 or depth > DOCS_LIST_MAX_DEPTH:
raise ValueError(f"`depth` must be between 1 and {DOCS_LIST_MAX_DEPTH}.")
index = _docs_index()
if not index.sections_by_path:
return []
if path is None:
return _expand_nav_entries(index, _ROOT_SECTION_PATH, depth)
normalized = path.strip("/")
if normalized in index.sections_by_path:
return _expand_nav_entries(index, normalized, depth)
if normalized in index.pages_by_path:
return [index.pages_by_path[normalized].to_catalog_dict()]
raise HTTPException(status_code=404, detail=f"Unknown docs section: {path!r}")
@traced_tool
async def read_doc(path: str, section: str | None = None) -> dict:
    """Read one docs page after you have narrowed to a likely match.

    Args:
        path: Docs route of the page. Surrounding whitespace and slashes
            are ignored.
        section: Optional section slug or title. When given, only that
            section's text is returned as ``content``.

    Returns:
        The page payload built by ``DocPage.to_read_dict``.

    Raises:
        ValueError: ``path`` is not a non-empty string.
        HTTPException: 404 when the page, or the requested section of it,
            is unknown.
    """
    await authenticate_mcp_request()

    if not isinstance(path, str) or not path.strip():
        raise ValueError("`path` must be a non-empty string.")

    # Look up the trimmed value that validation accepted. Otherwise a valid
    # route padded with whitespace passes the check above and then 404s.
    page = _get_page_or_404(path.strip())
    active_section = None
    if section is not None:
        active_section = _find_section(page, section)
        if active_section is None:
            raise HTTPException(
                status_code=404,
                detail=f"Unknown section {section!r} for docs page {path!r}",
            )
    return page.to_read_dict(section=active_section)
@traced_tool
async def search_docs(query: str, limit: int = 5) -> list[dict]:
"""Search the Dograh documentation and return a lean ranked shortlist.
Use this first for keyword or acronym lookup. Once the right page looks
likely, call ``read_doc(path)`` instead of reasoning from summaries alone.
"""
# Authentication is consistent with the rest of the MCP tools and
# routes through the same rate-limiting path, even though docs are
# not org-scoped data.
await authenticate_mcp_request()
if not isinstance(query, str) or not query.strip():
raise ValueError("query must be a non-empty string.")
try:
effective_limit = int(limit)
except (TypeError, ValueError) as exc:
raise ValueError("limit must be an integer.") from exc
if effective_limit < 1:
raise ValueError("limit must be at least 1.")
effective_limit = min(effective_limit, DOCS_SEARCH_MAX_LIMIT)
raise ValueError("`query` must be a non-empty string.")
if limit < 1:
raise ValueError("`limit` must be at least 1.")
terms = _tokenize_query(query)
if not terms:
# The caller passed something like punctuation-only or only
# single-char tokens — surface an actionable error rather than
# silently returning everything.
raise ValueError(
"query must contain at least one keyword of 2+ alphanumeric characters."
"`query` must contain at least one non-stopword alphanumeric term."
)
corpus = _docs_corpus()
if not corpus:
# Tool is registered but docs aren't on disk — return empty
# rather than 500ing so the caller can degrade gracefully.
index = _docs_index()
if not index.pages_by_path:
return []
scored: list[tuple[int, str, str, str]] = []
for rel_path, contents in corpus:
title = _extract_page_title(contents, fallback=rel_path)
body = _strip_frontmatter(contents)
score = _score_page(rel_path, title, body, terms)
capped_limit = min(limit, DOCS_SEARCH_MAX_LIMIT)
ranked: list[tuple[int, int, DocPage, DocSection | None]] = []
for page in index.pages_by_path.values():
score, best_section = _score_page(page, terms)
if score <= 0:
continue
scored.append((score, rel_path, title, body))
ranked.append((score, page.order, page, best_section))
scored.sort(key=lambda item: (-item[0], item[1]))
results: list[dict] = []
for score, rel_path, title, body in scored[:effective_limit]:
results.append(
{
"path": rel_path,
"url": _docs_url_for(rel_path),
"title": title,
"score": score,
"snippet": _build_snippet(body, terms),
}
)
return results
ranked.sort(key=lambda item: (-item[0], item[1], item[2].path))
return [
page.to_catalog_dict(section=best_section)
for _, _, page, best_section in ranked[:capped_limit]
]