# dograh/api/tests/test_mcp_docs_search.py

"""Unit tests for the `search_docs` MCP tool.

The tool reads the docs corpus from disk via ``_resolve_docs_root`` and
caches it with ``functools.lru_cache``. These tests point the cache at
a synthetic corpus per-test so the assertions don't depend on the real
docs tree (which evolves) and the LRU cache doesn't leak state.

`authenticate_mcp_request` is mocked so the tests don't need a live DB
or a valid API key — mirroring the pattern in
``test_mcp_save_workflow.py``.
"""

from __future__ import annotations

import os
from pathlib import Path
from unittest.mock import AsyncMock, patch

import pytest

from api.mcp_server.tools import docs_search as docs_search_module
from api.mcp_server.tools.docs_search import (
    _docs_url_for,
    _extract_page_title,
    _resolve_docs_root,
    _score_page,
    _strip_frontmatter,
    _tokenize_query,
    search_docs,
)


# ─── Fixtures ────────────────────────────────────────────────────────────


@pytest.fixture
def fake_docs_root(tmp_path: Path) -> Path:
    """Create a tiny synthetic docs tree and expose it via ``DOGRAH_DOCS_PATH``.

    The tree holds three ``.mdx`` pages (two under ``configurations/``, one
    under ``deployment/``) plus a ``docs.json`` file that searches are
    expected to ignore. The corpus cache is cleared before the test runs and
    again once it finishes, so every test starts from a cold cache.

    Yields:
        The root directory of the synthetic docs tree.
    """
    # Relative page path -> full page content (frontmatter + body).
    pages = {
        "configurations/voice.mdx": (
            "---\n"
            'title: "Voice"\n'
            "---\n\n"
            "# Voice configuration\n\n"
            "Dograh supports ElevenLabs and Cartesia TTS providers.\n"
            "Configure the ElevenLabs voice_id in your workspace settings.\n"
        ),
        "configurations/transcriber.mdx": (
            "---\n"
            'title: "Transcriber"\n'
            "---\n\n"
            "# Speech-to-text\n\nDeepgram is the default transcriber.\n"
        ),
        "deployment/turn-server.mdx": (
            "---\n"
            'title: "TURN server setup"\n'
            "---\n\n"
            "# TURN server\n\n"
            "WebRTC requires a TURN server for NAT traversal. Coturn is the "
            "recommended choice for self-hosted deployments.\n"
        ),
    }

    root = tmp_path / "docs"
    root.mkdir()
    for rel_path, content in pages.items():
        page_file = root / rel_path
        # Section directories are created lazily, once per section.
        page_file.parent.mkdir(exist_ok=True)
        page_file.write_text(content, encoding="utf-8")

    # Non-doc file living next to the pages; it must never show up in results.
    (root / "docs.json").write_text('{"name":"Dograh"}', encoding="utf-8")

    # Start from a cold cache, then pin the resolver to the tmp tree for the
    # duration of the test.
    docs_search_module._docs_corpus.cache_clear()
    env_override = {"DOGRAH_DOCS_PATH": str(root)}
    with patch.dict(os.environ, env_override):
        yield root
    docs_search_module._docs_corpus.cache_clear()


@pytest.fixture
def authed_user():
    """Stub ``authenticate_mcp_request`` so tests skip the API-key path.

    The mocked authenticator and the fixture hand out the *same* fake user
    instance, so a test can compare what it receives from this fixture with
    whatever the tool obtained from authentication.

    Yields:
        The fake user object returned by the patched authenticator.
    """

    class _FakeUser:
        selected_organization_id = 1
        id = 42

    user = _FakeUser()
    with patch(
        "api.mcp_server.tools.docs_search.authenticate_mcp_request",
        new=AsyncMock(return_value=user),
    ):
        yield user


# ─── Pure helpers ────────────────────────────────────────────────────────


def test_tokenize_query_strips_short_and_punct_terms():
    """One-character words and punctuation are dropped from the token list.

    If a stray ``a`` or a trailing ``?`` survived tokenization it would
    match almost every page and wash out the relevance ranking.
    """
    expected_tokens = ["how", "do", "configure", "turn", "server"]
    tokens = _tokenize_query("How do I configure a TURN server?")
    assert tokens == expected_tokens


def test_tokenize_query_empty_input_returns_empty():
    """Empty and punctuation-only queries both tokenize to an empty list."""
    for raw_query in ("", "?? // !!"):
        assert _tokenize_query(raw_query) == []


def test_strip_frontmatter_removes_yaml_block():
    """The leading ``---`` delimited block is removed; the body remains."""
    page = (
        "---\n"
        'title: "X"\n'
        "---\n"
        "\n"
        "# Heading\n"
    )
    stripped = _strip_frontmatter(page)
    assert stripped.startswith("# Heading")


def test_strip_frontmatter_passes_through_when_missing():
    """Text without a frontmatter block comes back unchanged."""
    page = "# Just a heading\nbody text\n"
    stripped = _strip_frontmatter(page)
    assert stripped == page


def test_extract_page_title_prefers_frontmatter():
    """A frontmatter ``title`` wins over the first heading in the body."""
    page = (
        "---\n"
        'title: "Front Title"\n'
        "---\n"
        "\n"
        "# Heading Title\n"
    )
    title = _extract_page_title(page, fallback="x.mdx")
    assert title == "Front Title"


def test_extract_page_title_falls_back_to_first_heading():
    """Without frontmatter, the first ATX heading supplies the title.

    That is a better signal than the filename, which is usually a slug
    rather than something human-readable.
    """
    page = "# Heading Title\nbody\n"
    title = _extract_page_title(page, fallback="x.mdx")
    assert title == "Heading Title"


def test_extract_page_title_falls_back_to_filename_when_nothing_matches():
    """With neither frontmatter nor a heading, the fallback is returned."""
    page = "plain prose with no heading or frontmatter"
    title = _extract_page_title(page, fallback="x.mdx")
    assert title == "x.mdx"


def test_docs_url_for_strips_extension_and_index():
    """URLs drop the ``.mdx`` suffix, and a trailing ``index`` segment too."""
    # Relative doc path -> expected public URL.
    expected_urls = {
        "configurations/voice.mdx": "https://docs.dograh.com/configurations/voice",
        "getting-started/index.mdx": "https://docs.dograh.com/getting-started",
    }
    for rel_path, expected_url in expected_urls.items():
        assert _docs_url_for(rel_path) == expected_url


def test_score_page_weights_title_above_body():
    """A single title hit must beat several body hits.

    Otherwise a long page that merely mentions the term over and over would
    outrank the page that is actually *about* the term.
    """
    query_terms = ["turn"]
    score_from_body = _score_page(
        rel_path="other.mdx",
        title="Unrelated",
        body="turn turn turn turn turn",
        terms=query_terms,
    )
    score_from_title = _score_page(
        rel_path="other.mdx",
        title="TURN server",
        body="unrelated text",
        terms=query_terms,
    )
    assert score_from_title > score_from_body


def test_score_page_returns_zero_when_no_terms_match():
    """A page containing none of the query terms scores exactly zero."""
    score = _score_page(
        rel_path="x.mdx",
        title="X",
        body="hello world",
        terms=["nonexistent"],
    )
    assert score == 0


def test_resolve_docs_root_honors_env_override(tmp_path: Path):
    """An existing directory named by ``DOGRAH_DOCS_PATH`` is used as root."""
    custom_root = tmp_path / "custom_docs"
    custom_root.mkdir()
    env_override = {"DOGRAH_DOCS_PATH": str(custom_root)}
    with patch.dict(os.environ, env_override):
        resolved = _resolve_docs_root()
        assert resolved == custom_root.resolve()


def test_resolve_docs_root_ignores_nonexistent_env_value(tmp_path: Path):
    """A bogus ``DOGRAH_DOCS_PATH`` must not crash the resolver.

    Whatever the resolver falls back to depends on where the tests are run
    from, so the exact path is deliberately not asserted: the only contract
    checked here is "no exception, and the result is either ``None`` or an
    existing directory".
    """
    missing_dir = tmp_path / "nope"
    env_override = {"DOGRAH_DOCS_PATH": str(missing_dir)}
    with patch.dict(os.environ, env_override):
        root = _resolve_docs_root()
        if root is not None:
            assert root.is_dir()


# ─── End-to-end tool behaviour ───────────────────────────────────────────


@pytest.mark.asyncio
async def test_search_docs_ranks_turn_setup_first_for_turn_query(
    fake_docs_root, authed_user
):
    """The TURN page, matching in both title and body, must rank first.

    Pages that only incidentally share words with the query have to land
    below it.
    """
    hits = await search_docs("How do I set up a TURN server?")
    assert hits, "expected at least one result"

    best = hits[0]
    assert best["path"] == "deployment/turn-server.mdx"
    assert best["url"] == "https://docs.dograh.com/deployment/turn-server"
    assert "TURN server" in best["title"]
    # Case-insensitive check: the snippet has to mention the query term.
    assert "turn" in best["snippet"].lower()


@pytest.mark.asyncio
async def test_search_docs_excludes_non_doc_files(fake_docs_root, authed_user):
    """``docs.json`` must not appear in results even though it contains the
    query term — only documentation pages are expected to be searchable.

    The search must also return *something*; without that guard an empty
    result list would make the exclusion check pass vacuously.
    """
    results = await search_docs("Dograh")
    assert results, "expected at least one doc page to match 'Dograh'"
    paths = [r["path"] for r in results]
    assert "docs.json" not in paths


@pytest.mark.asyncio
async def test_search_docs_returns_empty_when_no_match(fake_docs_root, authed_user):
    """A query whose terms occur nowhere in the corpus yields an empty list."""
    hits = await search_docs("xyzzy unrelated zzz")
    assert hits == []


@pytest.mark.asyncio
async def test_search_docs_respects_limit(fake_docs_root, authed_user):
    """``limit=1`` must collapse the result list even if multiple pages
    match.

    The base fixture contains a single doc page mentioning "Dograh", which
    on its own would let this test pass even if ``limit`` were ignored.
    Extra matching pages are added, and a baseline search confirms that
    more than one page matches before the limit is applied.
    """
    for i in range(3):
        (fake_docs_root / f"extra-{i}.mdx").write_text(
            f"# Page {i}\nThis Dograh page covers configurations topic {i}.\n",
            encoding="utf-8",
        )
    # The corpus is cached; drop it so the new pages are picked up.
    docs_search_module._docs_corpus.cache_clear()

    baseline = await search_docs("Dograh", limit=5)
    assert len(baseline) > 1, "test needs several matching pages to be meaningful"

    results = await search_docs("Dograh", limit=1)
    assert len(results) == 1


@pytest.mark.asyncio
async def test_search_docs_clamps_limit_to_hard_cap(fake_docs_root, authed_user):
    """A pathological large limit must be clamped to
    ``DOCS_SEARCH_MAX_LIMIT`` (=25) so the payload stays bounded.

    More than 25 matching pages are created, so the clamped result must
    contain exactly 25 entries. An upper-bound-only check would also pass
    when the search returned just a handful of results.
    """
    # Drop in extra docs so there's headroom to verify the clamp.
    for i in range(30):
        (fake_docs_root / f"extra-{i}.mdx").write_text(
            f"# Page {i}\nThis Dograh page covers configurations topic {i}.\n",
            encoding="utf-8",
        )
    # The corpus is cached; drop it so the new pages are picked up.
    docs_search_module._docs_corpus.cache_clear()
    results = await search_docs("Dograh", limit=999)
    assert len(results) == 25


@pytest.mark.asyncio
async def test_search_docs_returns_empty_when_no_corpus(
    tmp_path, authed_user, monkeypatch
):
    """If the docs directory doesn't exist on disk, the tool must
    degrade to an empty list rather than raising — Docker images and
    dev checkouts can disagree on layout.

    This test does not use ``fake_docs_root``, so it clears the corpus
    cache itself, both before and after, to avoid leaking a cached "no
    corpus" state into whichever test runs next.
    """
    nonexistent = tmp_path / "no-docs-here"
    monkeypatch.setenv("DOGRAH_DOCS_PATH", str(nonexistent))
    docs_search_module._docs_corpus.cache_clear()
    try:
        # Patch the resolver outright so no fallback discovery can find a
        # real docs tree; the env var above is only belt-and-braces.
        with patch(
            "api.mcp_server.tools.docs_search._resolve_docs_root", return_value=None
        ):
            results = await search_docs("anything")
    finally:
        docs_search_module._docs_corpus.cache_clear()
    assert results == []


@pytest.mark.asyncio
async def test_search_docs_rejects_empty_query(fake_docs_root, authed_user):
    """An empty query string is rejected with a ``ValueError``."""
    empty_query = ""
    with pytest.raises(ValueError, match="non-empty string"):
        await search_docs(empty_query)


@pytest.mark.asyncio
async def test_search_docs_rejects_query_with_no_real_terms(
    fake_docs_root, authed_user
):
    """Punctuation-only queries raise instead of matching everything.

    Such a query tokenizes to nothing; an actionable error is more useful
    than silently returning every page.
    """
    punctuation_only = "?? // !!"
    with pytest.raises(ValueError, match="2\\+ alphanumeric"):
        await search_docs(punctuation_only)


@pytest.mark.asyncio
async def test_search_docs_rejects_zero_limit(fake_docs_root, authed_user):
    """A ``limit`` of zero is rejected with a ``ValueError``."""
    zero_limit = 0
    with pytest.raises(ValueError, match="at least 1"):
        await search_docs("Dograh", limit=zero_limit)