PageIndex/tests/test_local_backend.py

# tests/sdk/test_local_backend.py
import asyncio
import json
import pytest
from pathlib import Path
from pageindex.backend.local import LocalBackend
from pageindex.storage.sqlite import SQLiteStorage
from pageindex.errors import FileTypeError, DocumentNotFoundError


@pytest.fixture
def backend(tmp_path):
    storage = SQLiteStorage(str(tmp_path / "test.db"))
    files_dir = tmp_path / "files"
    return LocalBackend(storage=storage, files_dir=str(files_dir), model="gpt-4o")


def test_collection_lifecycle(backend):
    backend.get_or_create_collection("papers")
    assert "papers" in backend.list_collections()
    backend.delete_collection("papers")
    assert "papers" not in backend.list_collections()


def test_list_documents_empty(backend):
    backend.get_or_create_collection("papers")
    assert backend.list_documents("papers") == []


def test_unsupported_file_type_raises(backend, tmp_path):
    backend.get_or_create_collection("papers")
    bad_file = tmp_path / "test.xyz"
    bad_file.write_text("hello")
    with pytest.raises(FileTypeError):
        backend.add_document("papers", str(bad_file))


def test_register_custom_parser(backend):
    from pageindex.parser.protocol import ParsedDocument, ContentNode

    class TxtParser:
        def supported_extensions(self):
            return [".txt"]
        def parse(self, file_path, **kwargs):
            text = Path(file_path).read_text()
            return ParsedDocument(doc_name="test", nodes=[
                ContentNode(content=text, tokens=len(text.split()), title="Content", index=1, level=1)
            ])

    backend.register_parser(TxtParser())
    # Now .txt should be supported (won't raise FileTypeError)
    assert backend._resolve_parser("test.txt") is not None


# ── Scoped-mode agent tools ──────────────────────────────────────────────────

@pytest.fixture
def populated_backend(backend):
    """Backend with a 'papers' collection containing two stub docs."""
    backend.get_or_create_collection("papers")
    for did, name, desc in [
        ("d1", "alpha.pdf", "About alpha."),
        ("d2", "beta.pdf", "About beta."),
    ]:
        backend._storage.save_document("papers", did, {
            "doc_name": name, "doc_description": desc,
            "doc_type": "pdf", "file_path": f"/tmp/{name}", "structure": [],
        })
    return backend


def _invoke_tool(tool, args: dict) -> str:
    """Run a FunctionTool synchronously with a minimal ToolContext."""
    from agents.tool_context import ToolContext
    ctx = ToolContext(context=None, tool_name=tool.name,
                      tool_call_id="test", tool_arguments=json.dumps(args))
    return asyncio.run(tool.on_invoke_tool(ctx, json.dumps(args)))


def test_open_mode_includes_list_documents(populated_backend):
    tools = populated_backend.get_agent_tools("papers", doc_ids=None)
    names = {t.name for t in tools.function_tools}
    assert names == {"list_documents", "get_document", "get_document_structure", "get_page_content"}


def test_scoped_mode_excludes_list_documents(populated_backend):
    tools = populated_backend.get_agent_tools("papers", doc_ids=["d1"])
    names = {t.name for t in tools.function_tools}
    assert "list_documents" not in names
    assert names == {"get_document", "get_document_structure", "get_page_content"}


def test_scoped_mode_rejects_out_of_scope_doc_id(populated_backend):
    tools = populated_backend.get_agent_tools("papers", doc_ids=["d1"])
    by_name = {t.name: t for t in tools.function_tools}
    out = json.loads(_invoke_tool(by_name["get_document"], {"doc_id": "d2"}))
    assert "error" in out
    assert "not in scope" in out["error"]
    assert out["allowed_doc_ids"] == ["d1"]


def test_scoped_mode_allows_in_scope_doc_id(populated_backend):
    tools = populated_backend.get_agent_tools("papers", doc_ids=["d1"])
    by_name = {t.name: t for t in tools.function_tools}
    out = json.loads(_invoke_tool(by_name["get_document"], {"doc_id": "d1"}))
    assert out.get("doc_name") == "alpha.pdf"


def test_wrap_with_doc_context_single(populated_backend):
    from pageindex.agent import wrap_with_doc_context
    docs = populated_backend._scoped_docs("papers", ["d1"])
    wrapped = wrap_with_doc_context(docs, "what is this?")
    assert "d1: alpha.pdf — About alpha." in wrapped
    assert "specified the following document" in wrapped
    assert "<docs>" in wrapped and "</docs>" in wrapped
    assert "User question: what is this?" in wrapped


def test_wrap_with_doc_context_multi(populated_backend):
    from pageindex.agent import wrap_with_doc_context
    docs = populated_backend._scoped_docs("papers", ["d1", "d2"])
    wrapped = wrap_with_doc_context(docs, "compare them")
    assert "d1: alpha.pdf — About alpha." in wrapped
    assert "d2: beta.pdf — About beta." in wrapped
    assert "specified the following documents" in wrapped
    assert "<docs>" in wrapped and "</docs>" in wrapped
    assert "User question: compare them" in wrapped


def test_scoped_docs_raises_on_missing(populated_backend):
    with pytest.raises(DocumentNotFoundError, match="nonexistent"):
        populated_backend._scoped_docs("papers", ["d1", "nonexistent"])


def test_normalize_doc_ids():
    assert LocalBackend._normalize_doc_ids("d1") == ["d1"]
    assert LocalBackend._normalize_doc_ids(["d1", "d2"]) == ["d1", "d2"]
    assert LocalBackend._normalize_doc_ids(None) is None


def test_normalize_doc_ids_rejects_empty_list():
    with pytest.raises(ValueError, match="cannot be empty"):
        LocalBackend._normalize_doc_ids([])