mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
feat: improve docs search
This commit is contained in:
parent
4618af20b8
commit
5c638070e0
5 changed files with 876 additions and 404 deletions
|
|
@ -16,6 +16,11 @@ You build and edit Dograh voice-AI workflows by emitting TypeScript that uses th
|
|||
|
||||
## Call order
|
||||
|
||||
### Reading documentation
|
||||
1. `search_docs(query)` — use first for keyword or acronym lookup when the user is asking how Dograh works or how to configure something.
|
||||
2. `read_doc(path)` — fetch the full page once one result looks likely. Prefer this over reasoning from search summaries alone.
|
||||
3. `list_docs(path=None, depth=1)` — use when the user wants to browse a topic area or when search terms are too vague. Returned section paths feed back into `list_docs`; returned page paths feed into `read_doc`.
|
||||
|
||||
### Editing an existing workflow
|
||||
1. `list_workflows` — locate the target workflow.
|
||||
2. `get_workflow_code(workflow_id)` — fetch the current source.
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
from fastmcp import FastMCP
|
||||
from mcp.types import ToolAnnotations
|
||||
|
||||
from api.mcp_server.instructions import DOGRAH_MCP_INSTRUCTIONS
|
||||
from api.mcp_server.tools.catalog import (
|
||||
|
|
@ -8,7 +9,7 @@ from api.mcp_server.tools.catalog import (
|
|||
list_tools,
|
||||
)
|
||||
from api.mcp_server.tools.create_workflow import create_workflow
|
||||
from api.mcp_server.tools.docs_search import search_docs
|
||||
from api.mcp_server.tools.docs_search import list_docs, read_doc, search_docs
|
||||
from api.mcp_server.tools.get_workflow_code import get_workflow_code
|
||||
from api.mcp_server.tools.node_types import get_node_type, list_node_types
|
||||
from api.mcp_server.tools.save_workflow import save_workflow
|
||||
|
|
@ -28,6 +29,15 @@ for _tool in (
|
|||
list_tools,
|
||||
list_workflows,
|
||||
save_workflow,
|
||||
search_docs,
|
||||
):
|
||||
mcp.tool(_tool)
|
||||
|
||||
# Annotation set shared by the docs tools: each hint below is passed to
# `mcp.tool` verbatim when the tools are registered.
_DOCS_TOOL_ANNOTATIONS = ToolAnnotations(
    readOnlyHint=True,
    idempotentHint=True,
    destructiveHint=False,
    openWorldHint=False,
)

# The docs tools are registered in their own loop so that they (and only
# they) carry `_DOCS_TOOL_ANNOTATIONS`.
for _tool in (list_docs, read_doc, search_docs):
    mcp.tool(_tool, annotations=_DOCS_TOOL_ANNOTATIONS)
|
||||
|
|
|
|||
|
|
@ -1,312 +1,704 @@
|
|||
"""`search_docs` MCP tool — keyword search over the Mintlify docs tree.
|
||||
"""MCP docs discovery tools over the Mintlify docs tree.
|
||||
|
||||
The docs are shipped into the API image (`COPY ./docs ./docs` in
|
||||
`api/Dockerfile`), so this tool works for both source/dev runs and
|
||||
Docker deployments. For source/dev runs we walk up from this file to
|
||||
locate the `docs/` directory; for Docker we land on `/app/docs`. An
|
||||
explicit `DOGRAH_DOCS_PATH` env var overrides discovery.
|
||||
The docs surface is intentionally split into three steps:
|
||||
|
||||
The implementation is intentionally dependency-free: it does in-memory
|
||||
keyword scoring rather than building a vector index. The docs corpus is
|
||||
small (~100 .mdx files, ~140k LoC), so a per-call scan is well under
|
||||
50 ms and avoids needing an embedding backend, vector store, or
|
||||
background indexer for a tool that's called interactively from MCP.
|
||||
- ``list_docs`` for lightweight navigation over the published hierarchy
|
||||
- ``search_docs`` for keyword lookup across the visible docs catalog
|
||||
- ``read_doc`` for the full content of one chosen page (or one section)
|
||||
|
||||
The runtime index is derived from ``docs/docs.json`` plus the referenced
|
||||
``.mdx``/``.md`` files. That keeps navigation, ordering, and visibility in
|
||||
sync with the published docs rather than indexing every file under ``docs/``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, replace
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
from fastapi import HTTPException
|
||||
|
||||
from api.mcp_server.auth import authenticate_mcp_request
|
||||
from api.mcp_server.tracing import traced_tool
|
||||
|
||||
# Public site for the rendered docs. Used to build a clickable URL per
|
||||
# result; agents can hand the URL back to the user even if the local
|
||||
# file isn't reachable.
|
||||
DOCS_SITE_BASE_URL = "https://docs.dograh.com"
|
||||
|
||||
# Hard cap regardless of caller-supplied limit. Keeps the MCP response
|
||||
# payload bounded; Mintlify search APIs use a similar 10-25 ceiling.
|
||||
DOCS_SEARCH_MAX_LIMIT = 25
|
||||
DOCS_LIST_MAX_DEPTH = 3
|
||||
_ROOT_SECTION_PATH = "__root__"
|
||||
|
||||
# Tokenizer, frontmatter, and heading-detection regexes. The heading
# pattern matches ATX headings (`# `, `## `, etc.) but not in-line `#`
# characters.
|
||||
_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
|
||||
_FRONTMATTER_RE = re.compile(r"\A---\s*\n(.*?)\n---\s*\n?", re.DOTALL)
|
||||
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.*?)\s*$", re.MULTILINE)
|
||||
_STOPWORDS = {
|
||||
"a",
|
||||
"an",
|
||||
"and",
|
||||
"are",
|
||||
"at",
|
||||
"be",
|
||||
"by",
|
||||
"can",
|
||||
"do",
|
||||
"for",
|
||||
"from",
|
||||
"how",
|
||||
"i",
|
||||
"if",
|
||||
"in",
|
||||
"is",
|
||||
"it",
|
||||
"me",
|
||||
"my",
|
||||
"of",
|
||||
"on",
|
||||
"or",
|
||||
"the",
|
||||
"to",
|
||||
"what",
|
||||
"when",
|
||||
"where",
|
||||
"with",
|
||||
"you",
|
||||
"your",
|
||||
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DocSection:
    """One addressable slice of a docs page (immutable)."""

    # Display title of the section.
    title: str
    # Identifier used to look the section up by slug.
    slug: str
    # Nesting level of the section heading.
    level: int
    # Text belonging to the section.
    content: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DocPage:
    """A docs page together with the metadata used for listing and reading."""

    # Route path that identifies the page.
    path: str
    # Location of the source file backing the page.
    file_path: str
    title: str
    description: str
    # Optional routing hint; `routing_hint()` falls back to `description`.
    llm_hint: str
    aliases: tuple[str, ...]
    # Titles of the enclosing navigation levels, outermost first.
    breadcrumb: tuple[str, ...]
    content: str
    sections: tuple[DocSection, ...]
    order: int

    def breadcrumb_text(self) -> str:
        """Render the breadcrumb trail as a single ``A > B`` string."""
        return " > ".join(self.breadcrumb)

    def routing_hint(self) -> str:
        """Return ``llm_hint`` when it is non-empty, else ``description``."""
        return self.llm_hint if self.llm_hint else self.description

    def to_catalog_dict(self, section: DocSection | None = None) -> dict:
        """Return the lightweight listing entry for this page.

        When ``section`` is given, its title and slug are added. Empty
        values are removed by ``_compact_dict``.
        """
        summary = {
            "kind": "page",
            "path": self.path,
            "title": self.title,
            "breadcrumb": self.breadcrumb_text(),
            "llm_hint": self.routing_hint(),
        }
        if section is not None:
            summary.update(
                section_title=section.title,
                section_slug=section.slug,
            )
        return _compact_dict(summary)

    def to_read_dict(self, section: DocSection | None = None) -> dict:
        """Return the full read payload for the page or for one section.

        ``content`` is the section's text when ``section`` is given and the
        whole page otherwise. ``sections`` always outlines the page's
        sections that have both a title and a slug.
        """
        if section is None:
            body = self.content
            section_title = None
            section_slug = None
        else:
            body = section.content
            section_title = section.title
            section_slug = section.slug

        outline = [
            {"title": entry.title, "slug": entry.slug}
            for entry in self.sections
            if entry.title and entry.slug
        ]
        return _compact_dict(
            {
                "path": self.path,
                "title": self.title,
                "breadcrumb": self.breadcrumb_text(),
                "llm_hint": self.routing_hint(),
                "section_title": section_title,
                "section_slug": section_slug,
                "content": body,
                "sections": outline,
            }
        )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class NavSection:
    """A navigation group that contains pages and/or nested sections."""

    path: str
    title: str
    # Titles of the enclosing navigation levels, outermost first.
    breadcrumb: tuple[str, ...]
    # Ordered ``(kind, path)`` pairs; kind is "page" or "section".
    children: tuple[tuple[str, str], ...]
    # Number of pages reachable below this section (0 until computed).
    descendant_page_count: int = 0

    def breadcrumb_text(self) -> str:
        """Render the breadcrumb trail as a single ``A > B`` string."""
        return " > ".join(self.breadcrumb)

    def to_mcp_dict(self) -> dict:
        """Return the listing entry for this section.

        A browse hint is included only when the section has at least one
        descendant page. Empty values are removed by ``_compact_dict``.
        """
        page_total = self.descendant_page_count
        browse_hint = (
            f"Browse {page_total} docs in this section." if page_total else None
        )
        return _compact_dict(
            {
                "kind": "section",
                "path": self.path,
                "title": self.title,
                "breadcrumb": self.breadcrumb_text(),
                "llm_hint": browse_hint,
                "has_children": bool(self.children),
                "child_count": len(self.children),
                "page_count": page_total,
            }
        )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DocsIndex:
    """Lookup tables for the docs catalog, keyed by path."""

    # Page path -> page.
    pages_by_path: dict[str, DocPage]
    # Section path -> navigation section.
    sections_by_path: dict[str, NavSection]
|
||||
|
||||
|
||||
def _compact_dict(data: dict[str, Any]) -> dict[str, Any]:
|
||||
return {
|
||||
key: value for key, value in data.items() if value not in (None, "", [], (), {})
|
||||
}
|
||||
|
||||
|
||||
def _slugify(value: str) -> str:
|
||||
slug = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-")
|
||||
return slug or "section"
|
||||
|
||||
|
||||
def _coerce_docs_root(candidate: Path) -> Path | None:
|
||||
candidate = candidate.expanduser().resolve()
|
||||
if (candidate / "docs.json").is_file():
|
||||
return candidate
|
||||
nested = candidate / "docs"
|
||||
if (nested / "docs.json").is_file():
|
||||
return nested
|
||||
return None
|
||||
|
||||
|
||||
def _resolve_docs_root() -> Path | None:
|
||||
"""Return the path to the on-disk docs tree, or None if not found.
|
||||
|
||||
Resolution order:
|
||||
1. ``DOGRAH_DOCS_PATH`` env var (absolute path).
|
||||
2. ``/app/docs`` — the location the API Dockerfile copies docs to.
|
||||
3. Walk upward from this file looking for a sibling ``docs/`` dir
|
||||
(covers source-checkout / dev runs).
|
||||
"""
|
||||
"""Return the path to the on-disk docs tree, or None if not found."""
|
||||
override = os.environ.get("DOGRAH_DOCS_PATH")
|
||||
if override:
|
||||
candidate = Path(override).expanduser().resolve()
|
||||
if candidate.is_dir():
|
||||
return candidate
|
||||
resolved = _coerce_docs_root(Path(override))
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
docker_default = Path("/app/docs")
|
||||
if docker_default.is_dir():
|
||||
docker_default = _coerce_docs_root(Path("/app/docs"))
|
||||
if docker_default is not None:
|
||||
return docker_default
|
||||
|
||||
# Walk up from .../api/mcp_server/tools/docs_search.py looking for docs/.
|
||||
for parent in Path(__file__).resolve().parents:
|
||||
candidate = parent / "docs"
|
||||
if candidate.is_dir():
|
||||
return candidate
|
||||
resolved = _coerce_docs_root(parent / "docs")
|
||||
if resolved is not None:
|
||||
return resolved
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def _docs_corpus() -> tuple[tuple[str, str], ...]:
    """Read every ``.mdx``/``.md`` file under the docs root, once per process.

    Returns:
        ``(relative_posix_path, file_text)`` pairs in sorted path order, or
        an empty tuple when no docs root is found. The result is memoized by
        ``lru_cache``, so later changes on disk are not picked up.
    """
    docs_root = _resolve_docs_root()
    if docs_root is None:
        return ()

    corpus: list[tuple[str, str]] = []
    for entry in sorted(docs_root.rglob("*")):
        if not (entry.is_file() and entry.suffix.lower() in {".mdx", ".md"}):
            continue
        try:
            text = entry.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # An unreadable file is skipped so it cannot break the tool.
            continue
        corpus.append((entry.relative_to(docs_root).as_posix(), text))
    return tuple(corpus)
|
||||
|
||||
|
||||
def _tokenize_query(query: str) -> list[str]:
|
||||
"""Split a user query into lowercased keyword terms.
|
||||
|
||||
Empty strings and 1-char filler terms are dropped — they would
|
||||
match almost every file and drown out the real signal.
|
||||
"""
|
||||
terms = re.findall(r"[A-Za-z0-9_]+", query.lower())
|
||||
return [term for term in terms if len(term) >= 2]
|
||||
|
||||
|
||||
def _extract_page_title(contents: str, fallback: str) -> str:
|
||||
"""Pull a human-readable title for a docs page.
|
||||
|
||||
Mintlify pages start with a YAML frontmatter block whose ``title``
|
||||
is the most authoritative title; fall back to the first ATX heading
|
||||
if frontmatter is missing or malformed; fall back to the filename
|
||||
if no heading exists.
|
||||
"""
|
||||
if contents.startswith("---"):
|
||||
end = contents.find("---", 3)
|
||||
if end != -1:
|
||||
frontmatter = contents[3:end]
|
||||
for line in frontmatter.splitlines():
|
||||
line = line.strip()
|
||||
if line.lower().startswith("title:"):
|
||||
value = line.split(":", 1)[1].strip()
|
||||
# Strip surrounding quotes if Mintlify wrote them.
|
||||
if (
|
||||
len(value) >= 2
|
||||
and value[0] == value[-1]
|
||||
and value[0] in ('"', "'")
|
||||
):
|
||||
value = value[1:-1]
|
||||
if value:
|
||||
return value
|
||||
|
||||
match = _HEADING_RE.search(contents)
|
||||
if match:
|
||||
return match.group(2).strip()
|
||||
|
||||
return fallback
|
||||
def _split_frontmatter(contents: str) -> tuple[dict[str, Any], str]:
    """Separate a leading YAML frontmatter block from the page body.

    Returns:
        ``(frontmatter, body)``. When there is no frontmatter block, or the
        YAML fails to parse, the result is ``({}, contents)`` with the text
        untouched. When the block parses to something other than a mapping,
        the frontmatter is ``{}`` but the block is still removed from the body.
    """
    block = _FRONTMATTER_RE.match(contents)
    if block is None:
        return {}, contents

    try:
        parsed = yaml.safe_load(block.group(1))
    except yaml.YAMLError:
        return {}, contents

    metadata = parsed if isinstance(parsed, dict) else {}
    body = contents[block.end() :].lstrip("\n")
    return metadata, body
|
||||
|
||||
|
||||
def _strip_frontmatter(contents: str) -> str:
|
||||
"""Drop the YAML frontmatter block from a docs page body."""
|
||||
if not contents.startswith("---"):
|
||||
return contents
|
||||
end = contents.find("---", 3)
|
||||
if end == -1:
|
||||
return contents
|
||||
return contents[end + 3 :].lstrip("\n")
|
||||
return _split_frontmatter(contents)[1]
|
||||
|
||||
|
||||
def _build_snippet(body: str, terms: list[str], snippet_radius: int = 120) -> str:
|
||||
"""Return a ~240-char window around the first term hit in ``body``.
|
||||
def _clean_heading_text(raw: str) -> str:
|
||||
text = re.sub(r"\s*\{#.*\}\s*$", "", raw.strip())
|
||||
return " ".join(text.split())
|
||||
|
||||
The window is centered on the earliest match (whichever term comes
|
||||
first wins) so the snippet shows context for the strongest signal,
|
||||
not the lexicographically-first term. Leading/trailing newlines are
|
||||
collapsed so the snippet renders cleanly through MCP's text payload.
|
||||
"""
|
||||
body_lower = body.lower()
|
||||
earliest = -1
|
||||
for term in terms:
|
||||
idx = body_lower.find(term)
|
||||
if idx != -1 and (earliest == -1 or idx < earliest):
|
||||
earliest = idx
|
||||
|
||||
if earliest == -1:
|
||||
# No hit in body — the match must have come from the title or
|
||||
# path, so just return the first line of body as orientation.
|
||||
first_line = next(
|
||||
(line.strip() for line in body.splitlines() if line.strip()),
|
||||
"",
|
||||
def _extract_page_title(contents: str, fallback: str) -> str:
    """Choose a human-readable title for a docs page.

    Preference order: a non-blank string ``title`` in the frontmatter, then
    the first heading in the body (cleaned with ``_clean_heading_text``),
    then ``fallback``.
    """
    metadata, body = _split_frontmatter(contents)

    declared = metadata.get("title")
    if isinstance(declared, str):
        declared = declared.strip()
        if declared:
            return declared

    heading = _HEADING_RE.search(body)
    if heading is None:
        return fallback
    return _clean_heading_text(heading.group(2))
|
||||
|
||||
|
||||
def _normalize_text(value: Any) -> str:
|
||||
if isinstance(value, str):
|
||||
return " ".join(value.strip().split())
|
||||
return ""
|
||||
|
||||
|
||||
def _normalize_aliases(value: Any) -> tuple[str, ...]:
|
||||
if isinstance(value, str):
|
||||
aliases = [value]
|
||||
elif isinstance(value, list):
|
||||
aliases = [item for item in value if isinstance(item, str)]
|
||||
else:
|
||||
aliases = []
|
||||
return tuple(alias.strip() for alias in aliases if alias.strip())
|
||||
|
||||
|
||||
def _extract_sections(body: str) -> tuple[DocSection, ...]:
|
||||
matches = list(_HEADING_RE.finditer(body))
|
||||
stripped_body = body.strip()
|
||||
if not matches:
|
||||
if not stripped_body:
|
||||
return ()
|
||||
return (
|
||||
DocSection(
|
||||
title="Overview",
|
||||
slug="overview",
|
||||
level=1,
|
||||
content=stripped_body,
|
||||
),
|
||||
)
|
||||
return first_line[: snippet_radius * 2]
|
||||
|
||||
start = max(0, earliest - snippet_radius)
|
||||
end = min(len(body), earliest + snippet_radius)
|
||||
snippet = body[start:end]
|
||||
# Collapse all whitespace runs (incl. internal newlines) for a
|
||||
# single-line snippet — MCP renders text payloads inline.
|
||||
snippet = " ".join(snippet.split())
|
||||
prefix = "…" if start > 0 else ""
|
||||
suffix = "…" if end < len(body) else ""
|
||||
return f"{prefix}{snippet}{suffix}"
|
||||
sections: list[DocSection] = []
|
||||
preamble = body[: matches[0].start()].strip()
|
||||
if preamble:
|
||||
sections.append(
|
||||
DocSection(
|
||||
title="Overview",
|
||||
slug="overview",
|
||||
level=1,
|
||||
content=preamble,
|
||||
)
|
||||
)
|
||||
|
||||
for index, match in enumerate(matches):
|
||||
start = match.start()
|
||||
end = matches[index + 1].start() if index + 1 < len(matches) else len(body)
|
||||
title = _clean_heading_text(match.group(2))
|
||||
sections.append(
|
||||
DocSection(
|
||||
title=title or "Section",
|
||||
slug=_slugify(title or "section"),
|
||||
level=len(match.group(1)),
|
||||
content=body[start:end].strip(),
|
||||
)
|
||||
)
|
||||
return tuple(sections)
|
||||
|
||||
|
||||
def _score_page(
|
||||
rel_path: str,
|
||||
title: str,
|
||||
body: str,
|
||||
terms: list[str],
|
||||
) -> int:
|
||||
"""Weighted keyword score for a single docs page.
|
||||
def _tokenize_text(text: str) -> list[str]:
    """Return the lowercase tokens of ``text`` that carry search signal.

    Tokens shorter than two characters and tokens listed in ``_STOPWORDS``
    are dropped. Order and duplicates are preserved.
    """
    tokens: list[str] = []
    for candidate in _TOKEN_RE.findall(text.lower()):
        if len(candidate) < 2 or candidate in _STOPWORDS:
            continue
        tokens.append(candidate)
    return tokens
|
||||
|
||||
Title/path matches outweigh body matches because they encode the
|
||||
page's purpose, not just incidental mentions. Each query term
|
||||
contributes independently — a page matching all terms ranks above
|
||||
one matching a single term many times.
|
||||
"""
|
||||
if not terms:
|
||||
return 0
|
||||
score = 0
|
||||
path_lower = rel_path.lower()
|
||||
title_lower = title.lower()
|
||||
body_lower = body.lower()
|
||||
for term in terms:
|
||||
path_hits = path_lower.count(term)
|
||||
title_hits = title_lower.count(term)
|
||||
body_hits = body_lower.count(term)
|
||||
if path_hits == 0 and title_hits == 0 and body_hits == 0:
|
||||
# Penalize pages that miss any query term — they probably
|
||||
# aren't what the caller wants.
|
||||
|
||||
def _tokenize_query(query: str) -> list[str]:
|
||||
"""Split a user query into lowercased keyword terms."""
|
||||
seen: set[str] = set()
|
||||
terms: list[str] = []
|
||||
for token in _TOKEN_RE.findall(query.lower()):
|
||||
if len(token) < 2 or token in _STOPWORDS or token in seen:
|
||||
continue
|
||||
# Diminishing returns past a few hits per term: 1 dominant page
|
||||
# shouldn't outweigh a page that hits every term. The cap is
|
||||
# deliberately set so ``title_weight (5)`` strictly exceeds
|
||||
# ``body_cap (4) × body_weight (1)`` — a page whose TITLE is the
|
||||
# term must outrank a page that merely mentions it repeatedly.
|
||||
body_hits = min(body_hits, 4)
|
||||
score += path_hits * 8 + title_hits * 5 + body_hits
|
||||
seen.add(token)
|
||||
terms.append(token)
|
||||
return terms
|
||||
|
||||
|
||||
def _resolve_doc_file(root: Path, route_path: str) -> Path | None:
|
||||
candidates = (
|
||||
root / f"{route_path}.mdx",
|
||||
root / f"{route_path}.md",
|
||||
root / route_path / "index.mdx",
|
||||
root / route_path / "index.md",
|
||||
)
|
||||
for candidate in candidates:
|
||||
if candidate.is_file():
|
||||
return candidate
|
||||
return None
|
||||
|
||||
|
||||
def _build_doc_page(
    root: Path,
    route_path: str,
    *,
    breadcrumb: tuple[str, ...],
    order: int,
) -> DocPage | None:
    """Load one docs page from disk and assemble its ``DocPage``.

    Args:
        root: Docs root directory.
        route_path: Route of the page, resolved via ``_resolve_doc_file``.
        breadcrumb: Navigation trail stored on the page.
        order: Ordinal stored on the page.

    Returns:
        The page, or ``None`` when no source file exists for the route or
        the file cannot be read as UTF-8.
    """
    source_file = _resolve_doc_file(root, route_path)
    if source_file is None:
        return None

    try:
        raw_text = source_file.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None

    metadata, body = _split_frontmatter(raw_text)
    page_body = body.strip()
    # Title fallback: last route segment, hyphens to spaces, title-cased.
    leaf_title = route_path.rsplit("/", 1)[-1].replace("-", " ").title()

    return DocPage(
        path=route_path,
        file_path=source_file.relative_to(root).as_posix(),
        title=_extract_page_title(raw_text, fallback=leaf_title),
        description=_normalize_text(metadata.get("description")),
        llm_hint=_normalize_text(metadata.get("llm_hint")),
        aliases=_normalize_aliases(metadata.get("aliases")),
        breadcrumb=breadcrumb,
        content=page_body,
        sections=_extract_sections(page_body),
        order=order,
    )
|
||||
|
||||
|
||||
def _score_counter(counter: Counter[str], term: str, *, weight: int, cap: int) -> int:
|
||||
return min(counter.get(term, 0), cap) * weight
|
||||
|
||||
|
||||
def _normalized_phrase(text: str) -> str:
    """Return the tokens of ``text`` (per ``_tokenize_text``) joined by single spaces."""
    tokens = _tokenize_text(text)
    return " ".join(tokens)
|
||||
|
||||
|
||||
def _score_section(section: DocSection, terms: list[str]) -> int:
    """Score how well one section matches the query terms.

    Per term: title occurrences count 7 each (capped at 2) and body
    occurrences 1 each (capped at 4). Each term with any hit adds a further
    4, and 6 is added when the space-joined terms appear verbatim in the
    normalized body text.

    Args:
        section: Section to score; ``title`` and ``content`` are read.
        terms: Query terms, already tokenized.

    Returns:
        The integer score; 0 when nothing matches.
    """
    # Tokenize the body once: the same tokens feed both the per-term counts
    # and the phrase check (previously the body was tokenized twice per call).
    body_tokens = _tokenize_text(section.content)
    title_counts = Counter(_tokenize_text(section.title))
    body_counts = Counter(body_tokens)

    score = 0
    matched_terms = 0
    for term in terms:
        term_score = _score_counter(
            title_counts, term, weight=7, cap=2
        ) + _score_counter(body_counts, term, weight=1, cap=4)
        if term_score:
            matched_terms += 1
            score += term_score
    score += matched_terms * 4

    phrase = " ".join(terms)
    # Same string `_normalized_phrase(section.content)` would produce.
    if phrase and phrase in " ".join(body_tokens):
        score += 6
    return score
|
||||
|
||||
|
||||
def _docs_url_for(rel_path: str) -> str:
    """Return the public docs URL for a path relative to the docs root.

    The ``.mdx``/``.md`` extension and a trailing ``/index`` segment are
    removed before the path is appended to ``DOCS_SITE_BASE_URL``.
    """
    route = re.sub(r"\.(mdx|md)$", "", rel_path, flags=re.IGNORECASE)
    index_suffix = "/index"
    if route.endswith(index_suffix):
        route = route[: len(route) - len(index_suffix)]
    url = f"{DOCS_SITE_BASE_URL}/{route}"
    return url.rstrip("/")
|
||||
def _score_page(page: DocPage, terms: list[str]) -> tuple[int, DocSection | None]:
    """Score a page against the query terms and pick its best section.

    The score combines per-term hits in the page metadata (path, title,
    breadcrumb, routing hint, aliases), 8 per matched term, the score of
    the best-matching section, and one phrase bonus (the first that applies
    of title 12, routing hint 8, path 8, best section body 4).

    Returns:
        ``(score, best_section)``. ``(0, None)`` when ``terms`` is empty or
        nothing matches. ``best_section`` is ``None`` when no section scored
        above zero.
    """
    if not terms:
        return 0, None

    # (token counts, weight per hit, cap on hits) for each metadata field.
    weighted_fields = (
        (Counter(_tokenize_text(page.path)), 6, 3),
        (Counter(_tokenize_text(page.title)), 10, 2),
        (Counter(_tokenize_text(" ".join(page.breadcrumb))), 4, 2),
        (Counter(_tokenize_text(page.routing_hint())), 7, 3),
        (Counter(_tokenize_text(" ".join(page.aliases))), 7, 3),
    )

    metadata_score = 0
    matched_terms = 0
    for term in terms:
        term_score = sum(
            _score_counter(counts, term, weight=weight, cap=cap)
            for counts, weight, cap in weighted_fields
        )
        if term_score:
            matched_terms += 1
            metadata_score += term_score

    best_section = None
    best_section_score = 0
    for candidate in page.sections:
        candidate_score = _score_section(candidate, terms)
        # Strictly greater: the earliest section wins ties.
        if candidate_score > best_section_score:
            best_section, best_section_score = candidate, candidate_score

    if metadata_score == 0 and best_section_score == 0:
        return 0, None

    total = metadata_score + matched_terms * 8 + best_section_score

    phrase = " ".join(terms)
    if phrase:
        # Ordered by priority; only the first matching source grants a bonus.
        bonus_sources = [
            (page.title, 12),
            (page.routing_hint(), 8),
            (page.path, 8),
        ]
        if best_section is not None:
            bonus_sources.append((best_section.content, 4))
        for text, bonus in bonus_sources:
            if phrase in _normalized_phrase(text):
                total += bonus
                break

    return total, best_section
|
||||
|
||||
|
||||
def _set_descendant_counts(
|
||||
sections_by_path: dict[str, NavSection],
|
||||
section_path: str,
|
||||
) -> int:
|
||||
section = sections_by_path[section_path]
|
||||
page_count = 0
|
||||
for child_kind, child_path in section.children:
|
||||
if child_kind == "page":
|
||||
page_count += 1
|
||||
else:
|
||||
page_count += _set_descendant_counts(sections_by_path, child_path)
|
||||
sections_by_path[section_path] = replace(section, descendant_page_count=page_count)
|
||||
return page_count
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def _docs_index() -> DocsIndex:
    """Build the docs index from ``docs.json`` and memoize it for the process.

    Only ``navigation.tabs[].groups[]`` is walked; nested ``group`` objects
    inside a group's ``pages`` list become child sections. Pages whose source
    file cannot be resolved or read are left out of the page map and of their
    parent's children.

    Returns:
        A ``DocsIndex``. Both maps are empty when no docs root is found or
        ``docs.json`` cannot be read or parsed. Otherwise ``sections_by_path``
        always contains the synthetic ``_ROOT_SECTION_PATH`` entry, even when
        the navigation has an unexpected shape and yields no other sections.
    """
    root = _resolve_docs_root()
    if root is None:
        return DocsIndex(pages_by_path={}, sections_by_path={})

    try:
        docs_config = json.loads((root / "docs.json").read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, json.JSONDecodeError):
        return DocsIndex(pages_by_path={}, sections_by_path={})

    # Validate the shape before walking it: valid JSON is not necessarily an
    # object with an object `navigation` and a list `tabs`. Unexpected shapes
    # are treated like a missing key instead of raising on `.get`/iteration.
    navigation = docs_config.get("navigation") if isinstance(docs_config, dict) else None
    tabs = navigation.get("tabs") if isinstance(navigation, dict) else None
    if not isinstance(tabs, list):
        tabs = []

    pages_by_path: dict[str, DocPage] = {}
    sections_by_path: dict[str, NavSection] = {}
    page_order = 0

    def ensure_unique_section_path(base_path: str) -> str:
        """Return ``base_path``, or ``base_path-N`` (N >= 2) if already taken."""
        if base_path not in sections_by_path:
            return base_path
        suffix = 2
        while f"{base_path}-{suffix}" in sections_by_path:
            suffix += 1
        return f"{base_path}-{suffix}"

    def walk_pages(
        items: list[Any],
        *,
        section_path: str,
        section_title: str,
        ancestor_breadcrumb: tuple[str, ...],
    ) -> None:
        """Register one section, its pages and (recursively) its sub-groups."""
        nonlocal page_order
        children: list[tuple[str, str]] = []
        page_breadcrumb = ancestor_breadcrumb + (section_title,)

        for item in items:
            if isinstance(item, str):
                route_path = item.strip("/")
                if not route_path:
                    continue
                # A page listed in several groups is built once; the first
                # occurrence decides its breadcrumb and order.
                if route_path not in pages_by_path:
                    page = _build_doc_page(
                        root,
                        route_path,
                        breadcrumb=page_breadcrumb,
                        order=page_order,
                    )
                    if page is not None:
                        pages_by_path[route_path] = page
                        page_order += 1
                if route_path in pages_by_path:
                    children.append(("page", route_path))
                continue

            if not isinstance(item, dict):
                continue
            group_title = str(item.get("group", "")).strip()
            nested_pages = item.get("pages")
            if not group_title or not isinstance(nested_pages, list):
                continue

            child_path = ensure_unique_section_path(
                f"{section_path}/{_slugify(group_title)}"
            )
            walk_pages(
                nested_pages,
                section_path=child_path,
                section_title=group_title,
                ancestor_breadcrumb=page_breadcrumb,
            )
            children.append(("section", child_path))

        sections_by_path[section_path] = NavSection(
            path=section_path,
            title=section_title,
            breadcrumb=ancestor_breadcrumb,
            children=tuple(children),
        )

    root_children: list[tuple[str, str]] = []
    for tab in tabs:
        if not isinstance(tab, dict):
            continue
        tab_title = str(tab.get("tab", "")).strip() or "Docs"
        groups = tab.get("groups")
        if not isinstance(groups, list):
            # Missing or malformed `groups`: nothing to index for this tab.
            continue
        for group in groups:
            if not isinstance(group, dict):
                continue
            group_title = str(group.get("group", "")).strip()
            group_pages = group.get("pages")
            if not group_title or not isinstance(group_pages, list):
                continue
            top_level_path = ensure_unique_section_path(
                f"{_slugify(tab_title)}/{_slugify(group_title)}"
            )
            walk_pages(
                group_pages,
                section_path=top_level_path,
                section_title=group_title,
                ancestor_breadcrumb=(tab_title,),
            )
            root_children.append(("section", top_level_path))

    sections_by_path[_ROOT_SECTION_PATH] = NavSection(
        path=_ROOT_SECTION_PATH,
        title="Docs",
        breadcrumb=(),
        children=tuple(root_children),
    )
    _set_descendant_counts(sections_by_path, _ROOT_SECTION_PATH)

    return DocsIndex(pages_by_path=pages_by_path, sections_by_path=sections_by_path)
|
||||
|
||||
|
||||
def _get_page_or_404(path: str) -> DocPage:
    """Look up a page by route path, ignoring leading/trailing slashes.

    Raises:
        HTTPException: 404 when the index has no page for ``path``.
    """
    found = _docs_index().pages_by_path.get(path.strip("/"))
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail=f"Unknown docs page: {path!r}")
|
||||
|
||||
|
||||
def _find_section(page: DocPage, section: str) -> DocSection | None:
|
||||
target = section.strip().lower()
|
||||
for candidate in page.sections:
|
||||
if candidate.slug.lower() == target or candidate.title.lower() == target:
|
||||
return candidate
|
||||
return None
|
||||
|
||||
|
||||
def _expand_nav_entries(
|
||||
index: DocsIndex,
|
||||
section_path: str,
|
||||
depth: int,
|
||||
) -> list[dict]:
|
||||
section = index.sections_by_path[section_path]
|
||||
results: list[dict] = []
|
||||
for child_kind, child_path in section.children:
|
||||
if child_kind == "section":
|
||||
child_section = index.sections_by_path[child_path]
|
||||
results.append(child_section.to_mcp_dict())
|
||||
if depth > 1:
|
||||
results.extend(_expand_nav_entries(index, child_path, depth - 1))
|
||||
else:
|
||||
results.append(index.pages_by_path[child_path].to_catalog_dict())
|
||||
return results
|
||||
|
||||
|
||||
@traced_tool
|
||||
async def search_docs(query: str, limit: int = 10) -> list[dict]:
|
||||
"""Search the Dograh documentation by keyword and return ranked pages.
|
||||
async def list_docs(path: str | None = None, depth: int = 1) -> list[dict]:
|
||||
"""Browse the Dograh docs hierarchy before reading a page in full.
|
||||
|
||||
Use this when the caller asks "how do I configure X" / "where are the docs for Y" /
|
||||
"what does Dograh say about Z" — anything that should land on a docs page
|
||||
rather than a workspace resource. For workspace data (agents, recordings,
|
||||
credentials), use ``list_workflows`` / ``list_recordings`` / ``list_credentials``
|
||||
instead.
|
||||
|
||||
Args:
|
||||
query: Free-form keywords (e.g. "TURN server", "elevenlabs voice").
|
||||
Tokenized on non-alphanumeric characters; terms shorter than
|
||||
2 characters are dropped.
|
||||
limit: Max pages to return. Capped at 25 regardless of input;
|
||||
default 10 keeps the payload small enough to inline in MCP.
|
||||
|
||||
Returns:
|
||||
Up to ``limit`` results, sorted by descending relevance score.
|
||||
Each entry has:
|
||||
* ``path`` — repo-relative path (e.g. ``configurations/voice.mdx``)
|
||||
* ``url`` — public docs URL (https://docs.dograh.com/...)
|
||||
* ``title`` — page title (from Mintlify frontmatter when present)
|
||||
* ``score`` — opaque integer relevance score
|
||||
* ``snippet`` — ~240-char excerpt around the first term hit
|
||||
``path`` addresses navigation sections exposed by this tool. Page paths
|
||||
returned by ``search_docs`` and ``read_doc`` are the published docs routes
|
||||
instead, for example ``voice-agent/tools/mcp-tool``.
|
||||
"""
|
||||
await authenticate_mcp_request()
|
||||
|
||||
if depth < 1 or depth > DOCS_LIST_MAX_DEPTH:
|
||||
raise ValueError(f"`depth` must be between 1 and {DOCS_LIST_MAX_DEPTH}.")
|
||||
|
||||
index = _docs_index()
|
||||
if not index.sections_by_path:
|
||||
return []
|
||||
|
||||
if path is None:
|
||||
return _expand_nav_entries(index, _ROOT_SECTION_PATH, depth)
|
||||
|
||||
normalized = path.strip("/")
|
||||
if normalized in index.sections_by_path:
|
||||
return _expand_nav_entries(index, normalized, depth)
|
||||
if normalized in index.pages_by_path:
|
||||
return [index.pages_by_path[normalized].to_catalog_dict()]
|
||||
|
||||
raise HTTPException(status_code=404, detail=f"Unknown docs section: {path!r}")
|
||||
|
||||
|
||||
@traced_tool
async def read_doc(path: str, section: str | None = None) -> dict:
    """Read one docs page after you have narrowed to a likely match.

    Args:
        path: Docs page path to read; must be a non-empty string.
        section: Optional section to select within the page, e.g. a
            section slug. When omitted the page is read without a
            section selected.

    Returns:
        The dict produced by the page's ``to_read_dict``.

    Raises:
        ValueError: If ``path`` is not a non-empty string.
        HTTPException: 404 when ``section`` is given but no matching
            section is found on the page.
    """
    await authenticate_mcp_request()

    path_is_blank = not isinstance(path, str) or not path.strip()
    if path_is_blank:
        raise ValueError("`path` must be a non-empty string.")

    page = _get_page_or_404(path)

    # No section requested: read the page as a whole.
    if section is None:
        return page.to_read_dict(section=None)

    matched_section = _find_section(page, section)
    if matched_section is None:
        raise HTTPException(
            status_code=404,
            detail=f"Unknown section {section!r} for docs page {path!r}",
        )
    return page.to_read_dict(section=matched_section)
|
||||
|
||||
|
||||
@traced_tool
|
||||
async def search_docs(query: str, limit: int = 5) -> list[dict]:
|
||||
"""Search the Dograh documentation and return a lean ranked shortlist.
|
||||
|
||||
Use this first for keyword or acronym lookup. Once the right page looks
|
||||
likely, call ``read_doc(path)`` instead of reasoning from summaries alone.
|
||||
"""
|
||||
# Authentication is consistent with the rest of the MCP tools and
|
||||
# routes through the same rate-limiting path, even though docs are
|
||||
# not org-scoped data.
|
||||
await authenticate_mcp_request()
|
||||
|
||||
if not isinstance(query, str) or not query.strip():
|
||||
raise ValueError("query must be a non-empty string.")
|
||||
|
||||
try:
|
||||
effective_limit = int(limit)
|
||||
except (TypeError, ValueError) as exc:
|
||||
raise ValueError("limit must be an integer.") from exc
|
||||
if effective_limit < 1:
|
||||
raise ValueError("limit must be at least 1.")
|
||||
effective_limit = min(effective_limit, DOCS_SEARCH_MAX_LIMIT)
|
||||
raise ValueError("`query` must be a non-empty string.")
|
||||
if limit < 1:
|
||||
raise ValueError("`limit` must be at least 1.")
|
||||
|
||||
terms = _tokenize_query(query)
|
||||
if not terms:
|
||||
# The caller passed something like punctuation-only or only
|
||||
# single-char tokens — surface an actionable error rather than
|
||||
# silently returning everything.
|
||||
raise ValueError(
|
||||
"query must contain at least one keyword of 2+ alphanumeric characters."
|
||||
"`query` must contain at least one non-stopword alphanumeric term."
|
||||
)
|
||||
|
||||
corpus = _docs_corpus()
|
||||
if not corpus:
|
||||
# Tool is registered but docs aren't on disk — return empty
|
||||
# rather than 500ing so the caller can degrade gracefully.
|
||||
index = _docs_index()
|
||||
if not index.pages_by_path:
|
||||
return []
|
||||
|
||||
scored: list[tuple[int, str, str, str]] = []
|
||||
for rel_path, contents in corpus:
|
||||
title = _extract_page_title(contents, fallback=rel_path)
|
||||
body = _strip_frontmatter(contents)
|
||||
score = _score_page(rel_path, title, body, terms)
|
||||
capped_limit = min(limit, DOCS_SEARCH_MAX_LIMIT)
|
||||
ranked: list[tuple[int, int, DocPage, DocSection | None]] = []
|
||||
for page in index.pages_by_path.values():
|
||||
score, best_section = _score_page(page, terms)
|
||||
if score <= 0:
|
||||
continue
|
||||
scored.append((score, rel_path, title, body))
|
||||
ranked.append((score, page.order, page, best_section))
|
||||
|
||||
scored.sort(key=lambda item: (-item[0], item[1]))
|
||||
|
||||
results: list[dict] = []
|
||||
for score, rel_path, title, body in scored[:effective_limit]:
|
||||
results.append(
|
||||
{
|
||||
"path": rel_path,
|
||||
"url": _docs_url_for(rel_path),
|
||||
"title": title,
|
||||
"score": score,
|
||||
"snippet": _build_snippet(body, terms),
|
||||
}
|
||||
)
|
||||
return results
|
||||
ranked.sort(key=lambda item: (-item[0], item[1], item[2].path))
|
||||
return [
|
||||
page.to_catalog_dict(section=best_section)
|
||||
for _, _, page, best_section in ranked[:capped_limit]
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import re
|
||||
from collections import Counter
|
||||
from typing import Any, Dict, List, Set
|
||||
from typing import Dict, List, Set
|
||||
|
||||
from api.services.workflow.dto import EdgeDataDTO, NodeType, ReactFlowDTO
|
||||
from api.services.workflow.errors import ItemKind, WorkflowError
|
||||
|
|
|
|||
|
|
@ -1,14 +1,4 @@
|
|||
"""Unit tests for the `search_docs` MCP tool.
|
||||
|
||||
The tool reads the docs corpus from disk via ``_resolve_docs_root`` and
|
||||
caches it with ``functools.lru_cache``. These tests point the cache at
|
||||
a synthetic corpus per-test so the assertions don't depend on the real
|
||||
docs tree (which evolves) and the LRU cache doesn't leak state.
|
||||
|
||||
`authenticate_mcp_request` is mocked so the tests don't need a live DB
|
||||
or a valid API key — mirroring the pattern in
|
||||
``test_mcp_save_workflow.py``.
|
||||
"""
|
||||
"""Unit tests for the MCP docs discovery tools."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
|
@ -17,71 +7,152 @@ from pathlib import Path
|
|||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
import pytest
|
||||
from fastapi import HTTPException
|
||||
|
||||
from api.mcp_server.tools import docs_search as docs_search_module
|
||||
from api.mcp_server.tools.docs_search import (
|
||||
_docs_url_for,
|
||||
_docs_index,
|
||||
_extract_page_title,
|
||||
_resolve_docs_root,
|
||||
_score_page,
|
||||
_strip_frontmatter,
|
||||
_tokenize_query,
|
||||
list_docs,
|
||||
read_doc,
|
||||
search_docs,
|
||||
)
|
||||
|
||||
|
||||
# ─── Fixtures ────────────────────────────────────────────────────────────
|
||||
def _clear_docs_caches() -> None:
    """Clear the cache on ``_docs_index`` so tests do not share index state."""
    cached_index_loader = docs_search_module._docs_index
    cached_index_loader.cache_clear()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def fake_docs_root(tmp_path: Path) -> Path:
|
||||
"""Build a minimal docs tree on disk and point the tool at it."""
|
||||
docs_root = tmp_path / "docs"
|
||||
docs_root.mkdir()
|
||||
|
||||
(docs_root / "configurations").mkdir()
|
||||
(docs_root / "configurations" / "voice.mdx").write_text(
|
||||
(docs_root / "getting-started").mkdir()
|
||||
(docs_root / "getting-started" / "index.mdx").write_text(
|
||||
"---\n"
|
||||
'title: "Voice"\n'
|
||||
'title: "Getting started"\n'
|
||||
'description: "Start using Dograh."\n'
|
||||
"---\n\n"
|
||||
"# Voice configuration\n\n"
|
||||
"Dograh supports ElevenLabs and Cartesia TTS providers.\n"
|
||||
"Configure the ElevenLabs voice_id in your workspace settings.\n",
|
||||
"# Getting started\n\n"
|
||||
"Welcome to Dograh.\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
(docs_root / "configurations" / "transcriber.mdx").write_text(
|
||||
|
||||
(docs_root / "voice-agent").mkdir()
|
||||
(docs_root / "voice-agent" / "introduction.mdx").write_text(
|
||||
"---\n"
|
||||
'title: "Transcriber"\n'
|
||||
'title: "Voice Agent Builder"\n'
|
||||
'description: "Build conversational workflows."\n'
|
||||
"---\n\n"
|
||||
"# Speech-to-text\n\nDeepgram is the default transcriber.\n",
|
||||
"# Voice Agent Builder\n\n"
|
||||
"Build workflows with nodes and tools.\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
(docs_root / "voice-agent" / "tools").mkdir()
|
||||
(docs_root / "voice-agent" / "tools" / "mcp-tool.mdx").write_text(
|
||||
"---\n"
|
||||
'title: "MCP Tool"\n'
|
||||
'description: "Connect external MCP servers."\n'
|
||||
'llm_hint: "Use for MCP server setup, remote tools, or model context protocol questions."\n'
|
||||
"aliases:\n"
|
||||
' - "model context protocol"\n'
|
||||
"---\n\n"
|
||||
"# MCP Tool\n\n"
|
||||
"Connect an external MCP server to your voice agent.\n\n"
|
||||
"## Authentication\n\n"
|
||||
"Provide the MCP endpoint URL and headers.\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
(docs_root / "deployment").mkdir()
|
||||
(docs_root / "deployment" / "turn-server.mdx").write_text(
|
||||
(docs_root / "deployment" / "docker.mdx").write_text(
|
||||
"---\n"
|
||||
'title: "TURN server setup"\n'
|
||||
'title: "Docker"\n'
|
||||
'description: "Deploy Dograh with Docker."\n'
|
||||
'llm_hint: "Use for Docker deployment, local setup, remote setup, TURN server, coturn, or WebRTC connectivity questions."\n'
|
||||
"aliases:\n"
|
||||
' - "coturn"\n'
|
||||
' - "turn server"\n'
|
||||
"---\n\n"
|
||||
"# TURN server\n\n"
|
||||
"WebRTC requires a TURN server for NAT traversal. Coturn is the "
|
||||
"recommended choice for self-hosted deployments.\n",
|
||||
"# Docker\n\n"
|
||||
"Run Dograh with Docker.\n\n"
|
||||
"## Troubleshooting WebRTC Connectivity\n\n"
|
||||
"If audio fails or ICE fails, configure a TURN server. Coturn is the recommended choice.\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
# A non-doc file that must be ignored by the corpus loader.
|
||||
(docs_root / "docs.json").write_text('{"name":"Dograh"}', encoding="utf-8")
|
||||
# Hidden/orphaned docs page: present on disk but not in docs.json, so it
|
||||
# must not be indexed by the MCP tools.
|
||||
(docs_root / "internal-only.mdx").write_text(
|
||||
"---\n"
|
||||
'title: "Internal TURN Notes"\n'
|
||||
"---\n\n"
|
||||
"# Internal TURN Notes\n\n"
|
||||
"This page mentions zyxinternalturntoken but is not user-facing.\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
# Reset the LRU cache and pin the resolver to our tmp tree.
|
||||
docs_search_module._docs_corpus.cache_clear()
|
||||
(docs_root / "AGENTS.md").write_text("# Internal instructions\n", encoding="utf-8")
|
||||
|
||||
(docs_root / "docs.json").write_text(
|
||||
"""{
|
||||
"navigation": {
|
||||
"tabs": [
|
||||
{
|
||||
"tab": "Guides",
|
||||
"groups": [
|
||||
{
|
||||
"group": "Getting started",
|
||||
"pages": [
|
||||
"getting-started/index"
|
||||
]
|
||||
},
|
||||
{
|
||||
"group": "Voice Agent Builder",
|
||||
"pages": [
|
||||
"voice-agent/introduction",
|
||||
{
|
||||
"group": "Tools",
|
||||
"pages": [
|
||||
"voice-agent/tools/mcp-tool"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"tab": "Developer",
|
||||
"groups": [
|
||||
{
|
||||
"group": "Deployment",
|
||||
"pages": [
|
||||
"deployment/docker"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
""",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
_clear_docs_caches()
|
||||
with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(docs_root)}):
|
||||
yield docs_root
|
||||
docs_search_module._docs_corpus.cache_clear()
|
||||
_clear_docs_caches()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def authed_user():
|
||||
"""Stub ``authenticate_mcp_request`` so tests skip the API-key path."""
|
||||
|
||||
class _FakeUser:
|
||||
selected_organization_id = 1
|
||||
id = 42
|
||||
|
|
@ -93,18 +164,8 @@ def authed_user():
|
|||
yield _FakeUser()
|
||||
|
||||
|
||||
# ─── Pure helpers ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_tokenize_query_strips_short_and_punct_terms():
|
||||
"""Punctuation and 1-char tokens must not bleed into the scorer.
|
||||
|
||||
A trailing `?` or stray `a` would otherwise match nearly every page
|
||||
and flatten the relevance ranking.
|
||||
"""
|
||||
assert _tokenize_query("How do I configure a TURN server?") == [
|
||||
"how",
|
||||
"do",
|
||||
def test_tokenize_query_dedupes_and_drops_stopwords():
|
||||
assert _tokenize_query("How do I configure a TURN server TURN?") == [
|
||||
"configure",
|
||||
"turn",
|
||||
"server",
|
||||
|
|
@ -121,155 +182,92 @@ def test_strip_frontmatter_removes_yaml_block():
|
|||
assert _strip_frontmatter(body).startswith("# Heading")
|
||||
|
||||
|
||||
def test_strip_frontmatter_passes_through_when_missing():
    """Text without a frontmatter block must come back unchanged."""
    plain_text = "# Just a heading\nbody text\n"
    stripped = _strip_frontmatter(plain_text)
    assert stripped == plain_text
|
||||
|
||||
|
||||
def test_extract_page_title_prefers_frontmatter():
    """A frontmatter ``title`` wins over the first heading in the body."""
    source = '---\ntitle: "Front Title"\n---\n\n# Heading Title\n'
    resolved_title = _extract_page_title(source, fallback="x.mdx")
    assert resolved_title == "Front Title"
|
||||
|
||||
|
||||
def test_extract_page_title_falls_back_to_first_heading():
    """Without frontmatter, the first ATX heading supplies the title.

    That beats returning the filename, which is often a slug rather than
    a human-readable title.
    """
    heading_only = "# Heading Title\nbody\n"
    resolved_title = _extract_page_title(heading_only, fallback="x.mdx")
    assert resolved_title == "Heading Title"
|
||||
|
||||
|
||||
def test_extract_page_title_falls_back_to_filename_when_nothing_matches():
    """With neither frontmatter nor a heading, the fallback is returned."""
    prose_only = "plain prose with no heading or frontmatter"
    resolved_title = _extract_page_title(prose_only, fallback="x.mdx")
    assert resolved_title == "x.mdx"
|
||||
|
||||
|
||||
def test_docs_url_for_strips_extension_and_index():
    """URLs drop the ``.mdx`` suffix and a trailing ``index`` segment."""
    expected_urls = {
        "configurations/voice.mdx": "https://docs.dograh.com/configurations/voice",
        "getting-started/index.mdx": "https://docs.dograh.com/getting-started",
    }
    for rel_path, expected_url in expected_urls.items():
        assert _docs_url_for(rel_path) == expected_url
|
||||
|
||||
|
||||
def test_score_page_weights_title_above_body():
    """A single title hit must outscore several body-only hits.

    Otherwise a long page that merely mentions the term many times would
    outrank the page that is actually about it.
    """
    score_from_title = _score_page(
        rel_path="other.mdx",
        title="TURN server",
        body="unrelated text",
        terms=["turn"],
    )
    score_from_body = _score_page(
        rel_path="other.mdx",
        title="Unrelated",
        body="turn turn turn turn turn",
        terms=["turn"],
    )
    assert score_from_title > score_from_body
|
||||
|
||||
|
||||
def test_score_page_returns_zero_when_no_terms_match():
|
||||
assert (
|
||||
_score_page(
|
||||
rel_path="x.mdx", title="X", body="hello world", terms=["nonexistent"]
|
||||
)
|
||||
== 0
|
||||
def test_score_page_uses_llm_hint_and_aliases():
    """``coturn`` appears only in the hint, aliases and one section here.

    The page must still score above zero, and that section must be
    reported as the best match.
    """
    webrtc_section = docs_search_module.DocSection(
        title="Troubleshooting WebRTC Connectivity",
        slug="troubleshooting-webrtc-connectivity",
        level=2,
        content="Configure a TURN server with coturn.",
    )
    docker_page = docs_search_module.DocPage(
        path="deployment/docker",
        file_path="deployment/docker.mdx",
        title="Docker",
        description="Deploy Dograh with Docker.",
        llm_hint="Use for TURN server and coturn setup.",
        aliases=("coturn",),
        breadcrumb=("Developer", "Deployment"),
        content="Docker deployment.",
        sections=(webrtc_section,),
        order=0,
    )

    page_score, best_section = _score_page(docker_page, ["coturn"])

    assert page_score > 0
    assert best_section is not None
    assert best_section.slug == "troubleshooting-webrtc-connectivity"
|
||||
|
||||
|
||||
def test_resolve_docs_root_honors_env_override(tmp_path: Path):
    """``DOGRAH_DOCS_PATH`` pointing at a real docs dir is used as the root."""
    custom_root = tmp_path / "custom_docs"
    custom_root.mkdir()
    manifest = custom_root / "docs.json"
    manifest.write_text("{}", encoding="utf-8")

    env_override = {"DOGRAH_DOCS_PATH": str(custom_root)}
    with patch.dict(os.environ, env_override):
        resolved_root = _resolve_docs_root()
        assert resolved_root == custom_root.resolve()
|
||||
|
||||
|
||||
def test_resolve_docs_root_ignores_nonexistent_env_value(tmp_path: Path):
    """A bogus env value must not crash the resolver.

    It should fall back to discovery (the real ``docs/`` in the repo)
    instead.
    """
    missing_dir = tmp_path / "nope"
    with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(missing_dir)}):
        # Where discovery lands depends on the directory the tests run
        # from, so only the shape of the result is checked: no exception,
        # and either nothing or an existing directory.
        resolved_root = _resolve_docs_root()
        assert resolved_root is None or resolved_root.is_dir()
|
||||
|
||||
|
||||
# ─── End-to-end tool behaviour ───────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_docs_ranks_turn_setup_first_for_turn_query(
|
||||
async def test_search_docs_ranks_turn_doc_and_uses_route_path(
|
||||
fake_docs_root, authed_user
|
||||
):
|
||||
"""The page whose title and body are both about TURN must outrank
|
||||
incidental mentions of related words on other pages."""
|
||||
results = await search_docs("How do I set up a TURN server?")
|
||||
assert results, "expected at least one result"
|
||||
assert results[0]["path"] == "deployment/turn-server.mdx"
|
||||
assert results[0]["url"] == "https://docs.dograh.com/deployment/turn-server"
|
||||
assert "TURN server" in results[0]["title"]
|
||||
assert "TURN" in results[0]["snippet"] or "turn" in results[0]["snippet"].lower()
|
||||
results = await search_docs("How do I configure coturn for WebRTC?")
|
||||
assert results
|
||||
assert results[0]["path"] == "deployment/docker"
|
||||
assert results[0]["section_slug"] == "troubleshooting-webrtc-connectivity"
|
||||
assert "TURN server" in results[0]["llm_hint"]
|
||||
assert "snippet" not in results[0]
|
||||
assert "score" not in results[0]
|
||||
assert "url" not in results[0]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_search_docs_excludes_non_doc_files(fake_docs_root, authed_user):
    """``docs.json`` must never show up as a hit.

    The corpus loader filters to .mdx/.md only.
    """
    hits = await search_docs("Dograh")
    hit_paths = [hit["path"] for hit in hits]
    assert "docs.json" not in hit_paths
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_docs_returns_empty_when_no_match(fake_docs_root, authed_user):
|
||||
results = await search_docs("xyzzy unrelated zzz")
|
||||
async def test_search_docs_indexes_only_docs_json_pages(fake_docs_root, authed_user):
|
||||
results = await search_docs("zyxinternalturntoken")
|
||||
assert results == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_docs_respects_limit(fake_docs_root, authed_user):
|
||||
"""``limit=1`` must collapse the result list even if multiple pages
|
||||
match."""
|
||||
results = await search_docs("Dograh", limit=1)
|
||||
results = await search_docs("dograh", limit=1)
|
||||
assert len(results) == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_docs_clamps_limit_to_hard_cap(fake_docs_root, authed_user):
|
||||
"""A pathological large limit must be clamped to
|
||||
``DOCS_SEARCH_MAX_LIMIT`` (=25) so the payload stays bounded."""
|
||||
# Drop in extra docs so there's headroom to verify the clamp.
|
||||
for i in range(30):
|
||||
(fake_docs_root / f"extra-{i}.mdx").write_text(
|
||||
f"# Page {i}\nThis Dograh page covers configurations topic {i}.\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
docs_search_module._docs_corpus.cache_clear()
|
||||
results = await search_docs("Dograh", limit=999)
|
||||
assert len(results) <= 25
|
||||
async def test_search_docs_returns_empty_when_no_match(fake_docs_root, authed_user):
|
||||
assert await search_docs("xyzzy unrelated zzz") == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_docs_returns_empty_when_no_corpus(
|
||||
tmp_path, authed_user, monkeypatch
|
||||
):
|
||||
"""If the docs directory doesn't exist on disk, the tool must
|
||||
degrade to an empty list rather than raising — Docker images and
|
||||
dev checkouts can disagree on layout."""
|
||||
nonexistent = tmp_path / "no-docs-here"
|
||||
monkeypatch.setenv("DOGRAH_DOCS_PATH", str(nonexistent))
|
||||
# Also block the walk-up fallback by pointing the resolver at a
|
||||
# tmp path with no `docs/` ancestor.
|
||||
docs_search_module._docs_corpus.cache_clear()
|
||||
_clear_docs_caches()
|
||||
with patch(
|
||||
"api.mcp_server.tools.docs_search._resolve_docs_root", return_value=None
|
||||
):
|
||||
results = await search_docs("anything")
|
||||
assert results == []
|
||||
assert await search_docs("anything") == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
|
@ -279,16 +277,83 @@ async def test_search_docs_rejects_empty_query(fake_docs_root, authed_user):
|
|||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_docs_rejects_query_with_no_real_terms(
|
||||
async def test_search_docs_rejects_query_with_only_stopwords(
|
||||
fake_docs_root, authed_user
|
||||
):
|
||||
"""A query like `"???"` tokenizes to nothing — surface an actionable
|
||||
error rather than silently returning every page."""
|
||||
with pytest.raises(ValueError, match="2\\+ alphanumeric"):
|
||||
await search_docs("?? // !!")
|
||||
with pytest.raises(ValueError, match="non-stopword"):
|
||||
await search_docs("how do I")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_search_docs_rejects_zero_limit(fake_docs_root, authed_user):
    """``limit=0`` is rejected with an "at least 1" ``ValueError``."""
    expect_limit_error = pytest.raises(ValueError, match="at least 1")
    with expect_limit_error:
        await search_docs("Dograh", limit=0)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_list_docs_returns_top_level_sections(fake_docs_root, authed_user):
    """With no arguments, ``list_docs`` starts with the fixture's guide sections."""
    entries = await list_docs()

    first_entry = entries[0]
    assert first_entry["kind"] == "section"
    assert first_entry["path"] == "guides/getting-started"

    second_entry = entries[1]
    assert second_entry["path"] == "guides/voice-agent-builder"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_list_docs_depth_expands_children(fake_docs_root, authed_user):
    """``depth=2`` lists the section's pages, its subsection and that subsection's page."""
    entries = await list_docs("guides/voice-agent-builder", depth=2)
    listed_paths = [entry["path"] for entry in entries]

    expected_paths = (
        "voice-agent/introduction",
        "guides/voice-agent-builder/tools",
        "voice-agent/tools/mcp-tool",
    )
    for expected in expected_paths:
        assert expected in listed_paths
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_list_docs_rejects_unknown_section(fake_docs_root, authed_user):
    """A path that is neither a section nor a page raises ``HTTPException``."""
    expect_unknown_section = pytest.raises(
        HTTPException, match="Unknown docs section"
    )
    with expect_unknown_section:
        await list_docs("nope")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_read_doc_returns_full_page_and_sections(fake_docs_root, authed_user):
    """Reading a page returns its metadata, section list and body, without a URL."""
    page = await read_doc("deployment/docker")

    assert page["path"] == "deployment/docker"
    assert page["title"] == "Docker"
    assert "url" not in page

    slugs = [entry["slug"] for entry in page["sections"]]
    for expected_slug in ("docker", "troubleshooting-webrtc-connectivity"):
        assert expected_slug in slugs

    # Case-insensitive check: lowercasing covers both "Coturn" and "coturn".
    assert "coturn" in page["content"].lower()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_read_doc_can_target_section(fake_docs_root, authed_user):
    """Passing ``section`` narrows the content to that section only."""
    target_slug = "troubleshooting-webrtc-connectivity"
    page = await read_doc("deployment/docker", section=target_slug)

    assert page["section_slug"] == "troubleshooting-webrtc-connectivity"

    content = page["content"]
    assert any(phrase in content for phrase in ("ICE fails", "TURN server"))
    # Text from the page intro must not leak into the section read.
    assert "Run Dograh with Docker." not in content
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_read_doc_rejects_unknown_page(fake_docs_root, authed_user):
    """An unknown page path raises ``HTTPException``."""
    expect_unknown_page = pytest.raises(HTTPException, match="Unknown docs page")
    with expect_unknown_page:
        await read_doc("missing/page")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_read_doc_rejects_unknown_section(fake_docs_root, authed_user):
    """A known page with an unknown section raises ``HTTPException``."""
    expect_unknown_section = pytest.raises(HTTPException, match="Unknown section")
    with expect_unknown_section:
        await read_doc("deployment/docker", section="missing-section")
|
||||
|
||||
|
||||
def test_docs_index_uses_docs_json_navigation(fake_docs_root):
    """The index follows ``docs.json`` navigation rather than the files on disk."""
    docs_index = _docs_index()

    # ``internal-only.mdx`` exists in the fixture tree but is not in docs.json.
    assert "internal-only" not in docs_index.pages_by_path
    assert "guides/voice-agent-builder/tools" in docs_index.sections_by_path

    mcp_tool_page = docs_index.pages_by_path["voice-agent/tools/mcp-tool"]
    expected_breadcrumb = ("Guides", "Voice Agent Builder", "Tools")
    assert mcp_tool_page.breadcrumb == expected_breadcrumb
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue