mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-22 08:38:13 +02:00
313 lines
12 KiB
Python
313 lines
12 KiB
Python
|
|
"""`search_docs` MCP tool — keyword search over the Mintlify docs tree.
|
|||
|
|
|
|||
|
|
The docs are shipped into the API image (`COPY ./docs ./docs` in
`api/Dockerfile`), so this tool works for both source/dev runs and
Docker deployments. Discovery order: an explicit `DOGRAH_DOCS_PATH`
env var wins when it points at a directory; otherwise `/app/docs` (the
Docker location) is used if present; otherwise we walk up from this
file to locate a `docs/` directory (source/dev runs).
|
|||
|
|
|
|||
|
|
The implementation is intentionally dependency-free: it does in-memory
|
|||
|
|
keyword scoring rather than building a vector index. The docs corpus is
|
|||
|
|
small (~100 .mdx files, ~140k LoC), so a per-call scan is well under
|
|||
|
|
50 ms and avoids needing an embedding backend, vector store, or
|
|||
|
|
background indexer for a tool that's called interactively from MCP.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import re
|
|||
|
|
from functools import lru_cache
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
from api.mcp_server.auth import authenticate_mcp_request
|
|||
|
|
from api.mcp_server.tracing import traced_tool
|
|||
|
|
|
|||
|
|
# Public site for the rendered docs. Used to build a clickable URL per
|
|||
|
|
# result; agents can hand the URL back to the user even if the local
|
|||
|
|
# file isn't reachable.
|
|||
|
|
DOCS_SITE_BASE_URL = "https://docs.dograh.com"
|
|||
|
|
|
|||
|
|
# Hard cap regardless of caller-supplied limit. Keeps the MCP response
|
|||
|
|
# payload bounded; Mintlify search APIs use a similar 10-25 ceiling.
|
|||
|
|
DOCS_SEARCH_MAX_LIMIT = 25
|
|||
|
|
|
|||
|
|
# Heading-detection regex. Matches ATX headings (`# `, `## `, etc.) but
|
|||
|
|
# not in-line `#` characters.
|
|||
|
|
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.*?)\s*$", re.MULTILINE)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _resolve_docs_root() -> Path | None:
    """Locate the docs directory on disk.

    Candidates are tried in order and the first existing directory wins:

    1. The ``DOGRAH_DOCS_PATH`` environment variable, with ``~`` expanded
       and the path resolved. An unset or empty value, or one that does
       not point at a directory, is ignored rather than raised on.
    2. ``/app/docs`` — where the API Dockerfile copies the docs.
    3. A ``docs`` directory directly under an ancestor directory of this
       file, nearest ancestor first (source-checkout / dev runs).

    Returns:
        The first matching directory, or ``None`` when none exists.
    """
    env_value = os.environ.get("DOGRAH_DOCS_PATH")
    if env_value:
        explicit = Path(env_value).expanduser().resolve()
        if explicit.is_dir():
            return explicit

    image_docs = Path("/app/docs")
    if image_docs.is_dir():
        return image_docs

    # `.parents` is ordered nearest-first, so the closest `docs/` wins.
    nearby = (ancestor / "docs" for ancestor in Path(__file__).resolve().parents)
    return next((found for found in nearby if found.is_dir()), None)
|
|||
|
|
|
|||
|
|
|
|||
|
|
@lru_cache(maxsize=1)
def _docs_corpus() -> tuple[tuple[str, str], ...]:
    """Read every Markdown/MDX page under the docs root into memory.

    The result is memoized by ``lru_cache``, so the tree is walked only on
    the first call in a process. Later edits to the docs, or a change to
    where the docs root resolves, are not picked up until the process is
    restarted.

    Returns:
        ``(relative_posix_path, file_text)`` pairs in sorted path order.
        Empty when no docs root can be resolved.
    """
    docs_root = _resolve_docs_root()
    if docs_root is None:
        return ()

    wanted_suffixes = {".mdx", ".md"}
    loaded: list[tuple[str, str]] = []
    for entry in sorted(docs_root.rglob("*")):
        if not (entry.is_file() and entry.suffix.lower() in wanted_suffixes):
            continue
        try:
            text = entry.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # One unreadable page should not take the whole tool down.
            continue
        loaded.append((entry.relative_to(docs_root).as_posix(), text))
    return tuple(loaded)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _tokenize_query(query: str) -> list[str]:
    """Break a free-form query into lowercase keyword terms.

    The query is lowercased and every maximal run of ASCII letters,
    digits, or underscores becomes a term. Runs shorter than two
    characters are discarded: they would match almost every page and
    swamp the meaningful terms. Order and duplicates are preserved.
    """
    lowered = query.lower()
    words = (hit.group(0) for hit in re.finditer(r"[A-Za-z0-9_]+", lowered))
    return [word for word in words if len(word) > 1]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _extract_page_title(contents: str, fallback: str) -> str:
    """Pull a human-readable title for a docs page.

    Lookup order:

    1. The first non-empty ``title:`` entry in a leading YAML frontmatter
       block (surrounding single or double quotes are removed).
    2. The text of the first non-empty ATX heading in the page body,
       i.e. the text *after* the frontmatter, so a YAML ``# comment``
       is never mistaken for a heading.
    3. ``fallback``.

    A frontmatter fence is recognised only when ``---`` occupies a whole
    line (trailing whitespace / ``\\r`` tolerated). An inline ``---``
    inside a value such as ``description: a --- b`` therefore does not
    end the block early.

    Args:
        contents: Full text of the page, frontmatter included.
        fallback: Value returned when neither a title nor a heading exists.

    Returns:
        The resolved title string.
    """
    body = contents
    opening_end = contents.find("\n")
    if opening_end != -1 and contents[:opening_end].rstrip() == "---":
        # Start just past the opening fence; that offset follows a newline,
        # so `^` (MULTILINE) can match there for an empty frontmatter block.
        closing = re.compile(r"^---[ \t]*\r?$", re.MULTILINE).search(
            contents, opening_end + 1
        )
        if closing is not None:
            frontmatter = contents[opening_end + 1 : closing.start()]
            for raw_line in frontmatter.splitlines():
                entry = raw_line.strip()
                if not entry.lower().startswith("title:"):
                    continue
                value = entry.split(":", 1)[1].strip()
                # Strip surrounding quotes if Mintlify wrote them.
                if (
                    len(value) >= 2
                    and value[0] == value[-1]
                    and value[0] in ('"', "'")
                ):
                    value = value[1:-1]
                if value:
                    return value
            body = contents[closing.end() :]

    # Skip headings with no text so an empty `# ` line cannot produce a
    # blank title.
    for match in _HEADING_RE.finditer(body):
        heading = match.group(2).strip()
        if heading:
            return heading

    return fallback
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _strip_frontmatter(contents: str) -> str:
    """Drop the leading YAML frontmatter block from a docs page.

    A fence is recognised only when ``---`` occupies a whole line
    (trailing whitespace / ``\\r`` tolerated). An inline ``---`` inside a
    frontmatter value therefore does not end the block early and leak the
    rest of the frontmatter into the body.

    Args:
        contents: Full text of the page.

    Returns:
        The text after the closing fence with leading blank lines removed,
        or ``contents`` unchanged when there is no opening fence or the
        block is never closed.
    """
    opening_end = contents.find("\n")
    if opening_end == -1 or contents[:opening_end].rstrip() != "---":
        return contents

    # Start just past the opening fence; that offset follows a newline, so
    # `^` (MULTILINE) can match there for an empty frontmatter block.
    closing = re.compile(r"^---[ \t]*\r?$", re.MULTILINE).search(
        contents, opening_end + 1
    )
    if closing is None:
        return contents
    return contents[closing.end() :].lstrip("\r\n")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _build_snippet(body: str, terms: list[str], snippet_radius: int = 120) -> str:
    """Cut a single-line excerpt of ``body`` around the earliest term hit.

    Each term is searched case-insensitively and the smallest match offset
    among all terms anchors the window, which spans ``snippet_radius``
    characters on either side (clamped to the body). Whitespace runs,
    including newlines, are collapsed to single spaces, and ``…`` marks any
    side where the body continues beyond the window.

    When no term occurs in the body (the page matched on title or path),
    the first non-blank line is returned instead, stripped and truncated
    to ``2 * snippet_radius`` characters; a body with no such line gives
    an empty string.
    """
    haystack = body.lower()
    offsets = [
        offset
        for offset in (haystack.find(term) for term in terms)
        if offset != -1
    ]

    if not offsets:
        for raw_line in body.splitlines():
            stripped = raw_line.strip()
            if stripped:
                return stripped[: snippet_radius * 2]
        return ""

    anchor = min(offsets)
    window_start = max(0, anchor - snippet_radius)
    window_end = min(len(body), anchor + snippet_radius)

    # MCP renders text payloads inline, so flatten to a single line.
    flattened = " ".join(body[window_start:window_end].split())

    leading = "…" if window_start > 0 else ""
    trailing = "…" if window_end < len(body) else ""
    return leading + flattened + trailing
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _score_page(
    rel_path: str,
    title: str,
    body: str,
    terms: list[str],
) -> int:
    """Compute a weighted keyword score for one docs page.

    Matching is case-insensitive substring counting. For every term the
    page earns:

    * 8 points per occurrence in ``rel_path``,
    * 5 points per occurrence in ``title``,
    * 1 point per occurrence in ``body``, counting at most 4 occurrences.

    The body cap keeps a single title hit (5) strictly above any amount
    of repetition in the body (at most 4), so a page titled after the
    term outranks one that merely mentions it often. A term that appears
    nowhere on the page simply adds nothing; no points are subtracted.

    Returns:
        The summed score, ``0`` when ``terms`` is empty or nothing matches.
    """
    haystacks = (rel_path.lower(), title.lower(), body.lower())
    total = 0
    for keyword in terms:
        in_path, in_title, in_body = (text.count(keyword) for text in haystacks)
        total += 8 * in_path + 5 * in_title + min(in_body, 4)
    return total
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _docs_url_for(rel_path: str) -> str:
    """Build the public docs URL for a relative on-disk path.

    The ``.md`` / ``.mdx`` extension (any case) is removed, and an
    ``index`` page collapses onto its directory, so
    ``getting-started/index.mdx`` maps to ``/getting-started``. A
    root-level ``index.mdx`` maps to the bare site URL instead of
    ``<base>/index``.

    Args:
        rel_path: POSIX-style path relative to the docs root.

    Returns:
        Absolute URL under ``DOCS_SITE_BASE_URL`` with no trailing slash.
    """
    no_ext = re.sub(r"\.(mdx|md)$", "", rel_path, flags=re.IGNORECASE)
    if no_ext == "index":
        # NOTE(review): assumes the root index page is served at the site
        # root, mirroring the nested `/index` handling below. Confirm
        # against Mintlify's routing.
        no_ext = ""
    elif no_ext.endswith("/index"):
        no_ext = no_ext[: -len("/index")]
    # rstrip drops the dangling "/" left when the path collapses to empty.
    return f"{DOCS_SITE_BASE_URL}/{no_ext}".rstrip("/")
|
|||
|
|
|
|||
|
|
|
|||
|
|
@traced_tool
async def search_docs(query: str, limit: int = 10) -> list[dict]:
    """Search the Dograh documentation by keyword and return ranked pages.

    Use this when the caller asks "how do I configure X" / "where are the docs for Y" /
    "what does Dograh say about Z" — anything that should land on a docs page
    rather than a workspace resource. For workspace data (agents, recordings,
    credentials), use ``list_workflows`` / ``list_recordings`` / ``list_credentials``
    instead.

    Args:
        query: Free-form keywords (e.g. "TURN server", "elevenlabs voice").
            Tokenized on non-alphanumeric characters; terms shorter than
            2 characters are dropped.
        limit: Max pages to return. Capped at 25 regardless of input;
            default 10 keeps the payload small enough to inline in MCP.

    Returns:
        Up to ``limit`` results, sorted by descending relevance score.
        Each entry has:
          * ``path`` — repo-relative path (e.g. ``configurations/voice.mdx``)
          * ``url`` — public docs URL (https://docs.dograh.com/...)
          * ``title`` — page title (from Mintlify frontmatter when present)
          * ``score`` — opaque integer relevance score
          * ``snippet`` — ~240-char excerpt around the first term hit
    """
    # Docs are not org-scoped, but authenticating keeps this tool on the
    # same auth and rate-limiting path as every other MCP tool.
    await authenticate_mcp_request()

    if not (isinstance(query, str) and query.strip()):
        raise ValueError("query must be a non-empty string.")

    try:
        requested = int(limit)
    except (TypeError, ValueError) as exc:
        raise ValueError("limit must be an integer.") from exc
    if requested < 1:
        raise ValueError("limit must be at least 1.")
    page_cap = min(requested, DOCS_SEARCH_MAX_LIMIT)

    keywords = _tokenize_query(query)
    if not keywords:
        # Punctuation-only or single-character input: fail with an
        # actionable message instead of returning an arbitrary result set.
        raise ValueError(
            "query must contain at least one keyword of 2+ alphanumeric characters."
        )

    pages = _docs_corpus()
    if not pages:
        # The tool is registered but no docs are on disk; an empty result
        # lets the caller degrade gracefully instead of seeing an error.
        return []

    ranked: list[tuple[int, str, str, str]] = []
    for rel_path, contents in pages:
        title = _extract_page_title(contents, fallback=rel_path)
        body = _strip_frontmatter(contents)
        relevance = _score_page(rel_path, title, body, keywords)
        if relevance > 0:
            ranked.append((relevance, rel_path, title, body))

    # Highest score first; ties are broken by path so output is stable.
    ranked.sort(key=lambda hit: (-hit[0], hit[1]))

    return [
        {
            "path": rel_path,
            "url": _docs_url_for(rel_path),
            "title": title,
            "score": relevance,
            "snippet": _build_snippet(body, keywords),
        }
        for relevance, rel_path, title, body in ranked[:page_cap]
    ]
|