# dograh/api/mcp_server/tools/docs_search.py

"""`search_docs` MCP tool — keyword search over the Mintlify docs tree.

The docs are shipped into the API image (`COPY ./docs ./docs` in
`api/Dockerfile`), so this tool works for both source/dev runs and
Docker deployments. For source/dev runs we walk up from this file to
locate the `docs/` directory; for Docker we land on `/app/docs`. An
explicit `DOGRAH_DOCS_PATH` env var overrides discovery.

The implementation is intentionally dependency-free: it does in-memory
keyword scoring rather than building a vector index. The docs corpus is
small (~100 .mdx files, ~140k LoC), so a per-call scan is well under
50 ms and avoids needing an embedding backend, vector store, or
background indexer for a tool that's called interactively from MCP.
"""

from __future__ import annotations

import os
import re
from functools import lru_cache
from pathlib import Path

from api.mcp_server.auth import authenticate_mcp_request
from api.mcp_server.tracing import traced_tool

# Public site for the rendered docs. Used to build a clickable URL per
# result; agents can hand the URL back to the user even if the local
# file isn't reachable. Stored without a trailing slash.
DOCS_SITE_BASE_URL = "https://docs.dograh.com"

# Hard cap regardless of caller-supplied limit. Keeps the MCP response
# payload bounded; Mintlify search APIs use a similar 10-25 ceiling.
DOCS_SEARCH_MAX_LIMIT = 25

# Heading-detection regex. Matches ATX headings (`# `, `## `, etc.) but
# not in-line `#` characters, because the pattern is anchored to the
# start of a line (MULTILINE). group(1) is the run of 1-6 `#` markers
# and group(2) is the heading text with trailing whitespace trimmed.
# The pattern knows nothing about fenced code blocks or YAML, so any
# line of the form `# something` matches wherever it appears.
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.*?)\s*$", re.MULTILINE)


def _resolve_docs_root() -> Path | None:
    """Return the path to the on-disk docs tree, or None if not found.

    Resolution order:
    1. ``DOGRAH_DOCS_PATH`` env var (absolute path).
    2. ``/app/docs`` — the location the API Dockerfile copies docs to.
    3. Walk upward from this file looking for a sibling ``docs/`` dir
       (covers source-checkout / dev runs).
    """
    override = os.environ.get("DOGRAH_DOCS_PATH")
    if override:
        candidate = Path(override).expanduser().resolve()
        if candidate.is_dir():
            return candidate

    docker_default = Path("/app/docs")
    if docker_default.is_dir():
        return docker_default

    # Walk up from .../api/mcp_server/tools/docs_search.py looking for docs/.
    for parent in Path(__file__).resolve().parents:
        candidate = parent / "docs"
        if candidate.is_dir():
            return candidate

    return None


@lru_cache(maxsize=1)
def _docs_corpus() -> tuple[tuple[str, str], ...]:
    """Read every Markdown/MDX page under the docs root into memory.

    Returns a tuple of ``(relative_posix_path, file_text)`` pairs in
    sorted path order, or an empty tuple when no docs root can be
    resolved. Only regular files with a ``.md`` / ``.mdx`` suffix
    (case-insensitive) are included.

    The result is memoized by ``lru_cache``, so the tree is walked once
    per process; edits to the docs or to ``DOGRAH_DOCS_PATH`` made after
    the first call are not picked up until the process restarts.
    """
    docs_root = _resolve_docs_root()
    if docs_root is None:
        return ()

    doc_suffixes = {".mdx", ".md"}
    loaded: list[tuple[str, str]] = []
    for entry in sorted(docs_root.rglob("*")):
        if not entry.is_file() or entry.suffix.lower() not in doc_suffixes:
            continue
        try:
            text = entry.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # One unreadable or non-UTF-8 page must not take down the
            # whole tool; leave it out of the corpus.
            continue
        loaded.append((entry.relative_to(docs_root).as_posix(), text))
    return tuple(loaded)


def _tokenize_query(query: str) -> list[str]:
    """Split a user query into lowercased keyword terms.

    Empty strings and 1-char filler terms are dropped — they would
    match almost every file and drown out the real signal.
    """
    terms = re.findall(r"[A-Za-z0-9_]+", query.lower())
    return [term for term in terms if len(term) >= 2]


def _extract_page_title(contents: str, fallback: str) -> str:
    """Pull a human-readable title for a docs page.

    Mintlify pages start with a YAML frontmatter block whose ``title``
    is the most authoritative title; fall back to the first ATX heading
    if frontmatter is missing or malformed; fall back to the filename
    if no heading exists.
    """
    if contents.startswith("---"):
        end = contents.find("---", 3)
        if end != -1:
            frontmatter = contents[3:end]
            for line in frontmatter.splitlines():
                line = line.strip()
                if line.lower().startswith("title:"):
                    value = line.split(":", 1)[1].strip()
                    # Strip surrounding quotes if Mintlify wrote them.
                    if (
                        len(value) >= 2
                        and value[0] == value[-1]
                        and value[0] in ('"', "'")
                    ):
                        value = value[1:-1]
                    if value:
                        return value

    match = _HEADING_RE.search(contents)
    if match:
        return match.group(2).strip()

    return fallback


def _strip_frontmatter(contents: str) -> str:
    """Drop the YAML frontmatter block from a docs page body."""
    if not contents.startswith("---"):
        return contents
    end = contents.find("---", 3)
    if end == -1:
        return contents
    return contents[end + 3 :].lstrip("\n")


def _build_snippet(body: str, terms: list[str], snippet_radius: int = 120) -> str:
    """Return a ~240-char window around the first term hit in ``body``.

    The window is centered on the earliest match (whichever term comes
    first wins) so the snippet shows context for the strongest signal,
    not the lexicographically-first term. Leading/trailing newlines are
    collapsed so the snippet renders cleanly through MCP's text payload.
    """
    body_lower = body.lower()
    earliest = -1
    for term in terms:
        idx = body_lower.find(term)
        if idx != -1 and (earliest == -1 or idx < earliest):
            earliest = idx

    if earliest == -1:
        # No hit in body — the match must have come from the title or
        # path, so just return the first line of body as orientation.
        first_line = next(
            (line.strip() for line in body.splitlines() if line.strip()),
            "",
        )
        return first_line[: snippet_radius * 2]

    start = max(0, earliest - snippet_radius)
    end = min(len(body), earliest + snippet_radius)
    snippet = body[start:end]
    # Collapse all whitespace runs (incl. internal newlines) for a
    # single-line snippet — MCP renders text payloads inline.
    snippet = " ".join(snippet.split())
    prefix = "…" if start > 0 else ""
    suffix = "…" if end < len(body) else ""
    return f"{prefix}{snippet}{suffix}"


def _score_page(
    rel_path: str,
    title: str,
    body: str,
    terms: list[str],
) -> int:
    """Weighted keyword score for a single docs page.

    Title/path matches outweigh body matches because they encode the
    page's purpose, not just incidental mentions. Each query term
    contributes independently — a page matching all terms ranks above
    one matching a single term many times.
    """
    if not terms:
        return 0
    score = 0
    path_lower = rel_path.lower()
    title_lower = title.lower()
    body_lower = body.lower()
    for term in terms:
        path_hits = path_lower.count(term)
        title_hits = title_lower.count(term)
        body_hits = body_lower.count(term)
        if path_hits == 0 and title_hits == 0 and body_hits == 0:
            # Penalize pages that miss any query term — they probably
            # aren't what the caller wants.
            continue
        # Diminishing returns past a few hits per term: 1 dominant page
        # shouldn't outweigh a page that hits every term. The cap is
        # deliberately set so ``title_weight (5)`` strictly exceeds
        # ``body_cap (4) × body_weight (1)`` — a page whose TITLE is the
        # term must outrank a page that merely mentions it repeatedly.
        body_hits = min(body_hits, 4)
        score += path_hits * 8 + title_hits * 5 + body_hits
    return score


def _docs_url_for(rel_path: str) -> str:
    """Build the public docs URL for a relative on-disk path."""
    # Strip the extension and `index` so `getting-started/index.mdx`
    # maps to `/getting-started`, matching Mintlify's routing.
    no_ext = re.sub(r"\.(mdx|md)$", "", rel_path, flags=re.IGNORECASE)
    if no_ext.endswith("/index"):
        no_ext = no_ext[: -len("/index")]
    return f"{DOCS_SITE_BASE_URL}/{no_ext}".rstrip("/")


@traced_tool
async def search_docs(query: str, limit: int = 10) -> list[dict]:
    """Search the Dograh documentation by keyword and return ranked pages.

    Use this when the caller asks "how do I configure X" / "where are the docs for Y" /
    "what does Dograh say about Z" — anything that should land on a docs page
    rather than a workspace resource. For workspace data (agents, recordings,
    credentials), use ``list_workflows`` / ``list_recordings`` / ``list_credentials``
    instead.

    Args:
        query: Free-form keywords (e.g. "TURN server", "elevenlabs voice").
            Tokenized on non-alphanumeric characters; terms shorter than
            2 characters are dropped.
        limit: Max pages to return. Capped at 25 regardless of input;
            default 10 keeps the payload small enough to inline in MCP.

    Returns:
        Up to ``limit`` results, sorted by descending relevance score.
        Each entry has:
          * ``path`` — repo-relative path (e.g. ``configurations/voice.mdx``)
          * ``url`` — public docs URL (https://docs.dograh.com/...)
          * ``title`` — page title (from Mintlify frontmatter when present)
          * ``score`` — opaque integer relevance score
          * ``snippet`` — ~240-char excerpt around the first term hit
    """
    # Docs are not org-scoped, but the tool still authenticates first,
    # like the other MCP tools, before any input validation happens.
    await authenticate_mcp_request()

    if not (isinstance(query, str) and query.strip()):
        raise ValueError("query must be a non-empty string.")

    try:
        requested = int(limit)
    except (TypeError, ValueError) as exc:
        raise ValueError("limit must be an integer.") from exc
    if requested < 1:
        raise ValueError("limit must be at least 1.")
    page_cap = min(requested, DOCS_SEARCH_MAX_LIMIT)

    keywords = _tokenize_query(query)
    if not keywords:
        # Punctuation-only or single-character input leaves nothing to
        # search for; reject it with an actionable message.
        raise ValueError(
            "query must contain at least one keyword of 2+ alphanumeric characters."
        )

    # An empty corpus (docs not on disk) simply produces no candidates,
    # so the caller gets an empty list rather than an error.
    candidates: list[tuple[int, str, str, str]] = []
    for doc_path, raw_text in _docs_corpus():
        page_title = _extract_page_title(raw_text, fallback=doc_path)
        page_body = _strip_frontmatter(raw_text)
        relevance = _score_page(doc_path, page_title, page_body, keywords)
        if relevance > 0:
            candidates.append((relevance, doc_path, page_title, page_body))

    # Best score first; ties are broken by path so the order is stable.
    ranked = sorted(candidates, key=lambda hit: (-hit[0], hit[1]))

    # Snippets are built only for the pages that are actually returned.
    return [
        {
            "path": doc_path,
            "url": _docs_url_for(doc_path),
            "title": page_title,
            "score": relevance,
            "snippet": _build_snippet(page_body, keywords),
        }
        for relevance, doc_path, page_title, page_body in ranked[:page_cap]
    ]