feat(mcp): add search_docs tool over Mintlify docs corpus

Closes #295. The docs at https://docs.dograh.com promise "Search the Dograh docs for how to configure a TURN server" as an MCP example prompt, but no search_docs tool exists in the MCP server — agents can list workspace resources but cannot search the documentation.

This adds a dependency-free, in-process keyword search over the `docs/` tree shipped into the API image (`COPY ./docs ./docs`):

- New `api/mcp_server/tools/docs_search.py` — async `search_docs(query, limit=10)` with weighted scoring (path > title > body), a 25-result hard cap, snippet extraction around the first term hit, and graceful empty-list degradation when docs aren't on disk. The `DOGRAH_DOCS_PATH` env var overrides location discovery for non-Docker layouts.
- Registered in `api/mcp_server/server.py` alongside the other tools, keeping the existing alphabetical ordering of the list.
- `api/tests/test_mcp_docs_search.py` — 21 unit tests covering the pure helpers (tokenizer, frontmatter stripping, title extraction, scoring weights, URL building, docs-root resolution) and end-to-end ranking, limit clamping, empty-corpus degradation, and input-validation errors. Mocks `authenticate_mcp_request` to avoid the DB dependency, mirroring `test_mcp_save_workflow.py`.

Implementation notes:

- The docs corpus is ~100 files / ~140k LoC, so a per-call scan runs well under 50 ms; avoiding a vector index / embedding backend keeps the tool zero-dependency and lets it work in fully offline self-hosted deployments.
- Authentication is required for consistency with the other MCP tools (and to route through the existing rate-limit middleware), even though docs are not org-scoped data.
- Title/path matches deliberately outweigh body matches so a page whose subject IS the query term outranks one that merely mentions it incidentally.
2026-06-10 08:05:22 +02:00 · 2026-05-19 09:59:24 +08:00 · 2026-05-19 09:59:24 +08:00 · 6d3c18975f
commit 6d3c18975f
parent 0097974444
3 changed files with 608 additions and 0 deletions
--- a/api/mcp_server/server.py
+++ b/api/mcp_server/server.py
@ -8,6 +8,7 @@ from api.mcp_server.tools.catalog import (
    list_tools,
 )
 from api.mcp_server.tools.create_workflow import create_workflow
+from api.mcp_server.tools.docs_search import search_docs
 from api.mcp_server.tools.get_workflow_code import get_workflow_code
 from api.mcp_server.tools.node_types import get_node_type, list_node_types
 from api.mcp_server.tools.save_workflow import save_workflow
@ -27,5 +28,6 @@ for _tool in (
    list_tools,
    list_workflows,
    save_workflow,
+    search_docs,
 ):
    mcp.tool(_tool)
--- a/api/mcp_server/tools/docs_search.py
+++ b/api/mcp_server/tools/docs_search.py
@ -0,0 +1,312 @@
+"""`search_docs` MCP tool — keyword search over the Mintlify docs tree.
+
+The docs are shipped into the API image (`COPY ./docs ./docs` in
+`api/Dockerfile`), so this tool works for both source/dev runs and
+Docker deployments. For source/dev runs we walk up from this file to
+locate the `docs/` directory; for Docker we land on `/app/docs`. An
+explicit `DOGRAH_DOCS_PATH` env var overrides discovery.
+
+The implementation is intentionally dependency-free: it does in-memory
+keyword scoring rather than building a vector index. The docs corpus is
+small (~100 .mdx files, ~140k LoC), so a per-call scan is well under
+50 ms and avoids needing an embedding backend, vector store, or
+background indexer for a tool that's called interactively from MCP.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+from functools import lru_cache
+from pathlib import Path
+
+from api.mcp_server.auth import authenticate_mcp_request
+from api.mcp_server.tracing import traced_tool
+
+# Public site for the rendered docs. Used to build a clickable URL per
+# result; agents can hand the URL back to the user even if the local
+# file isn't reachable.
+DOCS_SITE_BASE_URL = "https://docs.dograh.com"
+
+# Hard cap regardless of caller-supplied limit. Keeps the MCP response
+# payload bounded; Mintlify search APIs use a similar 10-25 ceiling.
+DOCS_SEARCH_MAX_LIMIT = 25
+
+# Heading-detection regex. Matches ATX headings (`# `, `## `, etc.) but
+# not in-line `#` characters: MULTILINE makes `^`/`$` anchor at every
+# line, so the 1-6 `#` markers must start a line. Group 1 is the marker
+# run, group 2 the heading text without surrounding whitespace.
+_HEADING_RE = re.compile(r"^(#{1,6})\s+(.*?)\s*$", re.MULTILINE)
+
+
+def _resolve_docs_root() -> Path | None:
+    """Locate the docs directory on disk.
+
+    Candidates are tried in this order and the first existing directory
+    wins:
+
+    1. The ``DOGRAH_DOCS_PATH`` environment variable, after ``~``
+       expansion and resolution to an absolute path. A value that does
+       not name a directory is ignored rather than treated as an error.
+    2. ``/app/docs``, where the API Dockerfile copies the docs.
+    3. A ``docs/`` directory inside any ancestor directory of this
+       file, nearest ancestor first (source-checkout / dev runs).
+
+    Returns:
+        The docs directory, or ``None`` when no candidate exists.
+    """
+    env_value = os.environ.get("DOGRAH_DOCS_PATH")
+    if env_value:
+        explicit = Path(env_value).expanduser().resolve()
+        if explicit.is_dir():
+            return explicit
+
+    image_docs = Path("/app/docs")
+    if image_docs.is_dir():
+        return image_docs
+
+    # Nearest ancestor first, so a checkout's own docs/ wins over any
+    # docs/ directory further up the filesystem.
+    ancestors = Path(__file__).resolve().parents
+    return next(
+        (
+            ancestor / "docs"
+            for ancestor in ancestors
+            if (ancestor / "docs").is_dir()
+        ),
+        None,
+    )
+
+
+@lru_cache(maxsize=1)
+def _docs_corpus() -> tuple[tuple[str, str], ...]:
+    """Read every Markdown page under the docs root, once per process.
+
+    Returns:
+        ``(relative_posix_path, file_text)`` pairs for each ``.md`` /
+        ``.mdx`` file (suffix compared case-insensitively), in sorted
+        ``Path`` order; an empty tuple when no docs root is found.
+
+    The result is memoized by ``lru_cache`` and the function takes no
+    arguments, so later edits to the files or to ``DOGRAH_DOCS_PATH``
+    are not picked up until ``_docs_corpus.cache_clear()`` is called or
+    the process restarts.
+    """
+    docs_root = _resolve_docs_root()
+    if docs_root is None:
+        return ()
+
+    doc_suffixes = {".mdx", ".md"}
+    loaded: list[tuple[str, str]] = []
+    for entry in sorted(docs_root.rglob("*")):
+        if entry.is_file() and entry.suffix.lower() in doc_suffixes:
+            try:
+                text = entry.read_text(encoding="utf-8")
+            except (OSError, UnicodeDecodeError):
+                # One unreadable or non-UTF-8 page must not take the
+                # whole search tool down; leave it out of the corpus.
+                continue
+            loaded.append((entry.relative_to(docs_root).as_posix(), text))
+    return tuple(loaded)
+
+
+def _tokenize_query(query: str) -> list[str]:
+    """Split a user query into unique, lowercased keyword terms.
+
+    A term is a run of ASCII letters, digits, or underscores; everything
+    else acts as a separator. Terms shorter than 2 characters are
+    dropped — they would match almost every file and drown out the real
+    signal.
+
+    Repeated words are returned once, in first-seen order. The scorer
+    adds up each term independently, so without de-duplication a query
+    such as "turn turn" would count every hit twice.
+
+    Args:
+        query: Free-form query text.
+
+    Returns:
+        The distinct terms in order of first appearance; empty when the
+        query contains no usable term.
+    """
+    words = re.findall(r"[A-Za-z0-9_]+", query.lower())
+    # dict preserves insertion order, so fromkeys() de-duplicates while
+    # keeping the order in which the user typed the terms.
+    return list(dict.fromkeys(word for word in words if len(word) >= 2))
+
+
+def _split_frontmatter(contents: str) -> tuple[str | None, str]:
+    """Split a docs page into ``(frontmatter, body)``.
+
+    A frontmatter block is recognised only when the very first line is
+    exactly ``---`` and some later line is exactly ``---`` (trailing
+    whitespace ignored). Matching whole lines, rather than the first
+    ``---`` substring, keeps a dash run inside a YAML value (for example
+    ``description: "a --- b"``) from being mistaken for the closing
+    fence, and keeps a leading ``----`` rule from being mistaken for the
+    opening one.
+
+    Returns:
+        ``(frontmatter_text, body_text)`` when a block is found, with the
+        fence lines excluded from both parts; ``(None, contents)`` when
+        the page has no frontmatter or the block is never closed.
+    """
+    lines = contents.splitlines(keepends=True)
+    if not lines or lines[0].rstrip() != "---":
+        return None, contents
+    for index, line in enumerate(lines[1:], start=1):
+        if line.rstrip() == "---":
+            return "".join(lines[1:index]), "".join(lines[index + 1 :])
+    return None, contents
+
+
+def _extract_page_title(contents: str, fallback: str) -> str:
+    """Pull a human-readable title for a docs page.
+
+    Sources are tried in order:
+
+    1. The first non-empty ``title:`` entry in the YAML frontmatter,
+       with one pair of matching surrounding quotes removed.
+    2. The first ATX heading in the page body. The frontmatter itself is
+       not searched, so a YAML ``# comment`` is never taken for a
+       heading.
+    3. ``fallback``.
+
+    Args:
+        contents: Full text of the page, frontmatter included.
+        fallback: Value to return when neither source yields a title.
+
+    Returns:
+        The title string.
+    """
+    frontmatter, body = _split_frontmatter(contents)
+    for raw_line in (frontmatter or "").splitlines():
+        line = raw_line.strip()
+        if not line.lower().startswith("title:"):
+            continue
+        value = line.split(":", 1)[1].strip()
+        # Strip surrounding quotes if Mintlify wrote them.
+        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
+            value = value[1:-1]
+        if value:
+            return value
+
+    match = _HEADING_RE.search(body)
+    if match:
+        return match.group(2).strip()
+
+    return fallback
+
+
+def _strip_frontmatter(contents: str) -> str:
+    """Drop the YAML frontmatter block from a docs page body.
+
+    Returns ``contents`` unchanged when there is no complete frontmatter
+    block. Otherwise returns the text after the closing fence with the
+    leading blank lines removed.
+    """
+    frontmatter, body = _split_frontmatter(contents)
+    if frontmatter is None:
+        return contents
+    return body.lstrip("\r\n")
+
+
+def _build_snippet(body: str, terms: list[str], snippet_radius: int = 120) -> str:
+    """Cut a short single-line excerpt out of ``body`` for display.
+
+    The excerpt spans ``snippet_radius`` characters on either side of
+    the earliest position at which any of ``terms`` is found in the
+    lowercased body. Whitespace runs are squeezed to single spaces and
+    an ellipsis marks each side on which text was cut off.
+
+    When no term is found in ``body`` — the page matched on its title
+    or path — the first non-blank line is returned instead, truncated
+    to ``2 * snippet_radius`` characters (empty string if every line is
+    blank).
+    """
+    lowered = body.lower()
+    positions = [pos for pos in (lowered.find(term) for term in terms) if pos != -1]
+
+    if not positions:
+        for raw_line in body.splitlines():
+            stripped = raw_line.strip()
+            if stripped:
+                return stripped[: snippet_radius * 2]
+        return ""
+
+    hit = min(positions)
+    window_start = max(0, hit - snippet_radius)
+    window_end = min(len(body), hit + snippet_radius)
+    # split() with no argument breaks on any whitespace run, so the join
+    # flattens internal newlines into single spaces.
+    excerpt = " ".join(body[window_start:window_end].split())
+    leading = "…" if window_start > 0 else ""
+    trailing = "…" if window_end < len(body) else ""
+    return leading + excerpt + trailing
+
+
+def _score_page(
+    rel_path: str,
+    title: str,
+    body: str,
+    terms: list[str],
+) -> int:
+    """Compute the weighted keyword score of one docs page.
+
+    For every term, substring occurrences are counted in the lowercased
+    path, title, and body, and the term adds
+    ``8 * path_hits + 5 * title_hits + min(body_hits, 4)`` to the total.
+    A term that occurs nowhere adds nothing; no penalty is subtracted.
+
+    Capping the body contribution at 4 keeps it below a single title
+    hit (5), so a page that has the term in its title outranks a page
+    that only repeats it in the text.
+
+    Returns:
+        The summed score; 0 when ``terms`` is empty or nothing matches.
+    """
+    path_text = rel_path.lower()
+    title_text = title.lower()
+    body_text = body.lower()
+
+    total = 0
+    for term in terms:
+        in_path = path_text.count(term)
+        in_title = title_text.count(term)
+        # Diminishing returns: only the first four body hits count.
+        in_body = min(body_text.count(term), 4)
+        total += in_path * 8 + in_title * 5 + in_body
+    return total
+
+
+def _docs_url_for(rel_path: str) -> str:
+    """Build the public docs URL for a relative on-disk path.
+
+    The ``.md`` / ``.mdx`` extension is removed and an ``index`` page
+    maps to its directory, so ``getting-started/index.mdx`` becomes
+    ``/getting-started`` and a root-level ``index.mdx`` becomes the site
+    root instead of ``/index``.
+
+    Args:
+        rel_path: POSIX-style path relative to the docs root.
+
+    Returns:
+        The absolute URL under ``DOCS_SITE_BASE_URL``, without a
+        trailing slash.
+    """
+    route = re.sub(r"\.(mdx|md)$", "", rel_path, flags=re.IGNORECASE)
+    if route == "index":
+        # Root index page: there is no "/index" suffix to strip.
+        route = ""
+    elif route.endswith("/index"):
+        route = route[: -len("/index")]
+    return f"{DOCS_SITE_BASE_URL}/{route}".rstrip("/")
+
+
+@traced_tool
+async def search_docs(query: str, limit: int = 10) -> list[dict]:
+    """Search the Dograh documentation by keyword and return ranked pages.
+
+    Use this when the caller asks "how do I configure X" / "where are the docs for Y" /
+    "what does Dograh say about Z" — anything that should land on a docs page
+    rather than a workspace resource. For workspace data (agents, recordings,
+    credentials), use ``list_workflows`` / ``list_recordings`` / ``list_credentials``
+    instead.
+
+    Args:
+        query: Free-form keywords (e.g. "TURN server", "elevenlabs voice").
+            Tokenized on non-alphanumeric characters; terms shorter than
+            2 characters are dropped.
+        limit: Max pages to return. Capped at 25 regardless of input;
+            default 10 keeps the payload small enough to inline in MCP.
+
+    Returns:
+        Up to ``limit`` results, sorted by descending relevance score.
+        Each entry has:
+          * ``path`` — repo-relative path (e.g. ``configurations/voice.mdx``)
+          * ``url`` — public docs URL (https://docs.dograh.com/...)
+          * ``title`` — page title (from Mintlify frontmatter when present)
+          * ``score`` — opaque integer relevance score
+          * ``snippet`` — ~240-char excerpt around the first term hit
+    """
+    # Docs are not org-scoped, but authenticating keeps this tool on the
+    # same path (and the same rate limiting) as every other MCP tool.
+    await authenticate_mcp_request()
+
+    if not (isinstance(query, str) and query.strip()):
+        raise ValueError("query must be a non-empty string.")
+
+    try:
+        requested = int(limit)
+    except (TypeError, ValueError) as exc:
+        raise ValueError("limit must be an integer.") from exc
+    if requested < 1:
+        raise ValueError("limit must be at least 1.")
+    page_budget = min(requested, DOCS_SEARCH_MAX_LIMIT)
+
+    terms = _tokenize_query(query)
+    if not terms:
+        # Punctuation-only or single-character input: tell the caller
+        # what to fix instead of returning an arbitrary page list.
+        raise ValueError(
+            "query must contain at least one keyword of 2+ alphanumeric characters."
+        )
+
+    # An empty corpus (docs not on disk) simply yields no hits, so the
+    # caller gets an empty list rather than an error.
+    hits: list[tuple[int, str, str, str]] = []
+    for rel_path, contents in _docs_corpus():
+        title = _extract_page_title(contents, fallback=rel_path)
+        body = _strip_frontmatter(contents)
+        score = _score_page(rel_path, title, body, terms)
+        if score > 0:
+            hits.append((score, rel_path, title, body))
+
+    # Best score first; equal scores fall back to path order so the
+    # ranking is deterministic.
+    hits.sort(key=lambda hit: (-hit[0], hit[1]))
+
+    return [
+        {
+            "path": rel_path,
+            "url": _docs_url_for(rel_path),
+            "title": title,
+            "score": score,
+            "snippet": _build_snippet(body, terms),
+        }
+        for score, rel_path, title, body in hits[:page_budget]
+    ]
--- a/api/tests/test_mcp_docs_search.py
+++ b/api/tests/test_mcp_docs_search.py
@ -0,0 +1,294 @@
+"""Unit tests for the `search_docs` MCP tool.
+
+The tool reads the docs corpus from disk via ``_resolve_docs_root`` and
+caches it with ``functools.lru_cache``. These tests point the cache at
+a synthetic corpus per-test so the assertions don't depend on the real
+docs tree (which evolves) and the LRU cache doesn't leak state.
+
+`authenticate_mcp_request` is mocked so the tests don't need a live DB
+or a valid API key — mirroring the pattern in
+``test_mcp_save_workflow.py``.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from api.mcp_server.tools import docs_search as docs_search_module
+from api.mcp_server.tools.docs_search import (
+    _docs_url_for,
+    _extract_page_title,
+    _resolve_docs_root,
+    _score_page,
+    _strip_frontmatter,
+    _tokenize_query,
+    search_docs,
+)
+
+
+# ─── Fixtures ────────────────────────────────────────────────────────────
+
+
+@pytest.fixture
+def fake_docs_root(tmp_path: Path) -> Path:
+    """Write a three-page docs tree under ``tmp_path`` and aim the tool at it.
+
+    ``DOGRAH_DOCS_PATH`` points at the tree for the duration of the
+    test, and the corpus cache is cleared before and after so no test
+    sees another test's pages.
+    """
+    docs_root = tmp_path / "docs"
+    files = {
+        "configurations/voice.mdx": (
+            "---\n"
+            'title: "Voice"\n'
+            "---\n\n"
+            "# Voice configuration\n\n"
+            "Dograh supports ElevenLabs and Cartesia TTS providers.\n"
+            "Configure the ElevenLabs voice_id in your workspace settings.\n"
+        ),
+        "configurations/transcriber.mdx": (
+            "---\n"
+            'title: "Transcriber"\n'
+            "---\n\n"
+            "# Speech-to-text\n\nDeepgram is the default transcriber.\n"
+        ),
+        "deployment/turn-server.mdx": (
+            "---\n"
+            'title: "TURN server setup"\n'
+            "---\n\n"
+            "# TURN server\n\n"
+            "WebRTC requires a TURN server for NAT traversal. Coturn is the "
+            "recommended choice for self-hosted deployments.\n"
+        ),
+        # Not a Markdown page: the corpus loader has to skip it.
+        "docs.json": '{"name":"Dograh"}',
+    }
+    for rel_path, text in files.items():
+        target = docs_root / rel_path
+        target.parent.mkdir(parents=True, exist_ok=True)
+        target.write_text(text, encoding="utf-8")
+
+    clear_corpus = docs_search_module._docs_corpus.cache_clear
+    clear_corpus()
+    with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(docs_root)}):
+        yield docs_root
+    clear_corpus()
+
+
+@pytest.fixture
+def authed_user():
+    """Replace ``authenticate_mcp_request`` with a stub that always succeeds.
+
+    The stub is patched onto the name imported into ``docs_search``, so
+    the tool never touches the real API-key lookup.
+    """
+
+    class _FakeUser:
+        id = 42
+        selected_organization_id = 1
+
+    auth_stub = AsyncMock(return_value=_FakeUser())
+    with patch(
+        "api.mcp_server.tools.docs_search.authenticate_mcp_request",
+        new=auth_stub,
+    ):
+        yield _FakeUser()
+
+
+# ─── Pure helpers ────────────────────────────────────────────────────────
+
+
+def test_tokenize_query_strips_short_and_punct_terms():
+    """The tokenizer drops punctuation and 1-character words.
+
+    Here the trailing ``?`` and the words ``I`` and ``a`` must not
+    survive, while every longer word is kept in lowercase.
+    """
+    expected = ["how", "do", "configure", "turn", "server"]
+    assert _tokenize_query("How do I configure a TURN server?") == expected
+
+
+def test_tokenize_query_empty_input_returns_empty():
+    """Blank and punctuation-only queries produce no terms."""
+    for raw_query in ("", "?? // !!"):
+        assert _tokenize_query(raw_query) == []
+
+
+def test_strip_frontmatter_removes_yaml_block():
+    """After stripping, the page starts at its first heading."""
+    page = '---\ntitle: "X"\n---\n\n# Heading\n'
+    stripped = _strip_frontmatter(page)
+    assert stripped.startswith("# Heading")
+
+
+def test_strip_frontmatter_passes_through_when_missing():
+    """A page without frontmatter is returned untouched."""
+    page = "# Just a heading\nbody text\n"
+    result = _strip_frontmatter(page)
+    assert result == page
+
+
+def test_extract_page_title_prefers_frontmatter():
+    """The frontmatter ``title`` wins over the first heading."""
+    page = '---\ntitle: "Front Title"\n---\n\n# Heading Title\n'
+    title = _extract_page_title(page, fallback="x.mdx")
+    assert title == "Front Title"
+
+
+def test_extract_page_title_falls_back_to_first_heading():
+    """Without frontmatter, the first ATX heading supplies the title.
+
+    That beats returning the filename, which is often a slug rather
+    than something human-readable.
+    """
+    page = "# Heading Title\nbody\n"
+    title = _extract_page_title(page, fallback="x.mdx")
+    assert title == "Heading Title"
+
+
+def test_extract_page_title_falls_back_to_filename_when_nothing_matches():
+    """With neither frontmatter nor a heading, the fallback is returned."""
+    page = "plain prose with no heading or frontmatter"
+    title = _extract_page_title(page, fallback="x.mdx")
+    assert title == "x.mdx"
+
+
+def test_docs_url_for_strips_extension_and_index():
+    """The extension is dropped and ``<dir>/index`` maps to ``<dir>``."""
+    expected_urls = {
+        "configurations/voice.mdx": "https://docs.dograh.com/configurations/voice",
+        "getting-started/index.mdx": "https://docs.dograh.com/getting-started",
+    }
+    for rel_path, url in expected_urls.items():
+        assert _docs_url_for(rel_path) == url
+
+
+def test_score_page_weights_title_above_body():
+    """One title hit must beat five body-only hits.
+
+    Otherwise a long page that merely repeats the term would outrank
+    the page that is actually about it.
+    """
+    shared = {"rel_path": "other.mdx", "terms": ["turn"]}
+    title_only = _score_page(title="TURN server", body="unrelated text", **shared)
+    body_only = _score_page(
+        title="Unrelated", body="turn turn turn turn turn", **shared
+    )
+    assert body_only < title_only
+
+
+def test_score_page_returns_zero_when_no_terms_match():
+    """A term found in neither path, title, nor body scores nothing."""
+    score = _score_page(
+        rel_path="x.mdx", title="X", body="hello world", terms=["nonexistent"]
+    )
+    assert score == 0
+
+
+def test_resolve_docs_root_honors_env_override(tmp_path: Path):
+    """An existing directory named by ``DOGRAH_DOCS_PATH`` is returned."""
+    custom_root = tmp_path / "custom_docs"
+    custom_root.mkdir()
+    override = {"DOGRAH_DOCS_PATH": str(custom_root)}
+    with patch.dict(os.environ, override):
+        found = _resolve_docs_root()
+    assert found == custom_root.resolve()
+
+
+def test_resolve_docs_root_ignores_nonexistent_env_value(tmp_path: Path):
+    """A bogus override must not crash the resolver.
+
+    Discovery then falls through to the other candidates. Which one is
+    found depends on where the tests run, so the only assertion is that
+    the result is ``None`` or an existing directory.
+    """
+    missing = tmp_path / "nope"
+    with patch.dict(os.environ, {"DOGRAH_DOCS_PATH": str(missing)}):
+        found = _resolve_docs_root()
+        assert found is None or found.is_dir()
+
+
+# ─── End-to-end tool behaviour ───────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_search_docs_ranks_turn_setup_first_for_turn_query(
+    fake_docs_root, authed_user
+):
+    """The TURN page (path, title, and body all match) ranks first."""
+    results = await search_docs("How do I set up a TURN server?")
+    assert results, "expected at least one result"
+
+    top = results[0]
+    assert top["path"] == "deployment/turn-server.mdx"
+    assert top["url"] == "https://docs.dograh.com/deployment/turn-server"
+    assert "TURN server" in top["title"]
+    # The case-insensitive check also covers an exact "TURN" match.
+    assert "turn" in top["snippet"].lower()
+
+
+@pytest.mark.asyncio
+async def test_search_docs_excludes_non_doc_files(fake_docs_root, authed_user):
+    """``docs.json`` contains the query word but is not ``.md``/``.mdx``,
+    so it must never show up in the results."""
+    results = await search_docs("Dograh")
+    returned_paths = {hit["path"] for hit in results}
+    assert "docs.json" not in returned_paths
+
+
+@pytest.mark.asyncio
+async def test_search_docs_returns_empty_when_no_match(fake_docs_root, authed_user):
+    """Terms that occur on no page yield an empty result list."""
+    hits = await search_docs("xyzzy unrelated zzz")
+    assert hits == []
+
+
+@pytest.mark.asyncio
+async def test_search_docs_respects_limit(fake_docs_root, authed_user):
+    """``limit=1`` must cut a multi-page result down to the top hit.
+
+    The query has to match more than one fixture page, otherwise the
+    test would pass even if ``limit`` were ignored. "configurations"
+    occurs in the path of both pages under ``configurations/``.
+    """
+    unlimited = await search_docs("configurations")
+    assert len(unlimited) > 1, "query must match several pages for limit to matter"
+
+    limited = await search_docs("configurations", limit=1)
+    assert limited == unlimited[:1]
+
+
+@pytest.mark.asyncio
+async def test_search_docs_clamps_limit_to_hard_cap(fake_docs_root, authed_user):
+    """An oversized limit is clamped to ``DOCS_SEARCH_MAX_LIMIT``.
+
+    Thirty extra matching pages are added so that more pages match than
+    the cap allows; the result must then contain exactly the cap, which
+    shows the clamp was applied.
+    """
+    for i in range(30):
+        (fake_docs_root / f"extra-{i}.mdx").write_text(
+            f"# Page {i}\nThis Dograh page covers configurations topic {i}.\n",
+            encoding="utf-8",
+        )
+    # The fixture's corpus may already be memoized; reload from disk.
+    docs_search_module._docs_corpus.cache_clear()
+
+    results = await search_docs("Dograh", limit=999)
+    assert len(results) == docs_search_module.DOCS_SEARCH_MAX_LIMIT
+
+
+@pytest.mark.asyncio
+async def test_search_docs_returns_empty_when_no_corpus(
+    tmp_path, authed_user, monkeypatch
+):
+    """If the docs directory doesn't exist on disk, the tool must
+    degrade to an empty list rather than raising — Docker images and
+    dev checkouts can disagree on layout."""
+    monkeypatch.setenv("DOGRAH_DOCS_PATH", str(tmp_path / "no-docs-here"))
+    docs_search_module._docs_corpus.cache_clear()
+    try:
+        # Stub the resolver itself: the `/app/docs` and walk-up
+        # fallbacks could otherwise still find a real docs tree on the
+        # machine running the tests.
+        with patch(
+            "api.mcp_server.tools.docs_search._resolve_docs_root",
+            return_value=None,
+        ):
+            results = await search_docs("anything")
+    finally:
+        # The empty corpus is memoized by now; drop it so it cannot
+        # leak into tests that run later in the same process.
+        docs_search_module._docs_corpus.cache_clear()
+    assert results == []
+
+
+@pytest.mark.asyncio
+async def test_search_docs_rejects_empty_query(fake_docs_root, authed_user):
+    """An empty query string is rejected with a ``ValueError``."""
+    blank_query = ""
+    with pytest.raises(ValueError, match="non-empty string"):
+        await search_docs(blank_query)
+
+
+@pytest.mark.asyncio
+async def test_search_docs_rejects_query_with_no_real_terms(
+    fake_docs_root, authed_user
+):
+    """A punctuation-only query tokenizes to nothing and must raise an
+    actionable error instead of returning pages."""
+    punctuation_only = "?? // !!"
+    with pytest.raises(ValueError, match="2\\+ alphanumeric"):
+        await search_docs(punctuation_only)
+
+
+@pytest.mark.asyncio
+async def test_search_docs_rejects_zero_limit(fake_docs_root, authed_user):
+    """A limit below 1 is rejected with a ``ValueError``."""
+    zero = 0
+    with pytest.raises(ValueError, match="at least 1"):
+        await search_docs("Dograh", limit=zero)