PageIndex/tests/test_local_backend.py
mountain a47c36a3f5 feat(collection): doc_ids accepts str|list, design cleanups
- Collection.query and Backend.query/query_stream accept doc_ids as
  str, list[str] or None. Single str is normalized to [str] inside each
  backend; bare [] is rejected with ValueError at both layers.
- wrap_with_doc_context wraps the scoped doc list in <docs>...</docs>
  and SCOPED_SYSTEM_PROMPT instructs the agent to treat that block as
  data, not instructions (defense against prompt injection via
  auto-generated doc_description).
- _require_cloud_api now distinguishes api_key="" from api_key=None;
  the former gives a targeted error pointing at the empty-string vs
  fall-back-to-local situation when legacy SDK methods are called.
- Legacy PageIndexClient.list_documents docstring spells out the
  return-shape difference vs collection.list_documents() to flag a
  silent migration footgun (paginated dict with id/name keys vs plain
  list[dict] with doc_id/doc_name keys).
- Remove dead CloudBackend.get_agent_tools stub (not on the Backend
  protocol; only ever returned an empty AgentTools()) and the
  SYSTEM_PROMPT alias (OPEN_/SCOPED_SYSTEM_PROMPT are the explicit
  names now).
- README quick start and streaming example now pass doc_ids; new
  multi-document section shows both str and list forms.
- examples/demo_query_modes.py exercises all five query-mode cases
  (single-doc, multi-doc with/without env var, scoped single, scoped
  multi) for manual verification.
2026-05-15 17:03:17 +08:00

143 lines
5.5 KiB
Python

# tests/sdk/test_local_backend.py
import asyncio
import json
import pytest
from pathlib import Path
from pageindex.backend.local import LocalBackend
from pageindex.storage.sqlite import SQLiteStorage
from pageindex.errors import FileTypeError, DocumentNotFoundError
@pytest.fixture
def backend(tmp_path):
storage = SQLiteStorage(str(tmp_path / "test.db"))
files_dir = tmp_path / "files"
return LocalBackend(storage=storage, files_dir=str(files_dir), model="gpt-4o")
def test_collection_lifecycle(backend):
backend.get_or_create_collection("papers")
assert "papers" in backend.list_collections()
backend.delete_collection("papers")
assert "papers" not in backend.list_collections()
def test_list_documents_empty(backend):
backend.get_or_create_collection("papers")
assert backend.list_documents("papers") == []
def test_unsupported_file_type_raises(backend, tmp_path):
backend.get_or_create_collection("papers")
bad_file = tmp_path / "test.xyz"
bad_file.write_text("hello")
with pytest.raises(FileTypeError):
backend.add_document("papers", str(bad_file))
def test_register_custom_parser(backend):
from pageindex.parser.protocol import ParsedDocument, ContentNode
class TxtParser:
def supported_extensions(self):
return [".txt"]
def parse(self, file_path, **kwargs):
text = Path(file_path).read_text()
return ParsedDocument(doc_name="test", nodes=[
ContentNode(content=text, tokens=len(text.split()), title="Content", index=1, level=1)
])
backend.register_parser(TxtParser())
# Now .txt should be supported (won't raise FileTypeError)
assert backend._resolve_parser("test.txt") is not None
# ── Scoped-mode agent tools ──────────────────────────────────────────────────
@pytest.fixture
def populated_backend(backend):
"""Backend with a 'papers' collection containing two stub docs."""
backend.get_or_create_collection("papers")
for did, name, desc in [
("d1", "alpha.pdf", "About alpha."),
("d2", "beta.pdf", "About beta."),
]:
backend._storage.save_document("papers", did, {
"doc_name": name, "doc_description": desc,
"doc_type": "pdf", "file_path": f"/tmp/{name}", "structure": [],
})
return backend
def _invoke_tool(tool, args: dict) -> str:
"""Run a FunctionTool synchronously with a minimal ToolContext."""
from agents.tool_context import ToolContext
ctx = ToolContext(context=None, tool_name=tool.name,
tool_call_id="test", tool_arguments=json.dumps(args))
return asyncio.run(tool.on_invoke_tool(ctx, json.dumps(args)))
def test_open_mode_includes_list_documents(populated_backend):
tools = populated_backend.get_agent_tools("papers", doc_ids=None)
names = {t.name for t in tools.function_tools}
assert names == {"list_documents", "get_document", "get_document_structure", "get_page_content"}
def test_scoped_mode_excludes_list_documents(populated_backend):
tools = populated_backend.get_agent_tools("papers", doc_ids=["d1"])
names = {t.name for t in tools.function_tools}
assert "list_documents" not in names
assert names == {"get_document", "get_document_structure", "get_page_content"}
def test_scoped_mode_rejects_out_of_scope_doc_id(populated_backend):
tools = populated_backend.get_agent_tools("papers", doc_ids=["d1"])
by_name = {t.name: t for t in tools.function_tools}
out = json.loads(_invoke_tool(by_name["get_document"], {"doc_id": "d2"}))
assert "error" in out
assert "not in scope" in out["error"]
assert out["allowed_doc_ids"] == ["d1"]
def test_scoped_mode_allows_in_scope_doc_id(populated_backend):
tools = populated_backend.get_agent_tools("papers", doc_ids=["d1"])
by_name = {t.name: t for t in tools.function_tools}
out = json.loads(_invoke_tool(by_name["get_document"], {"doc_id": "d1"}))
assert out.get("doc_name") == "alpha.pdf"
def test_wrap_with_doc_context_single(populated_backend):
from pageindex.agent import wrap_with_doc_context
docs = populated_backend._scoped_docs("papers", ["d1"])
wrapped = wrap_with_doc_context(docs, "what is this?")
assert "d1: alpha.pdf — About alpha." in wrapped
assert "specified the following document" in wrapped
assert "<docs>" in wrapped and "</docs>" in wrapped
assert "User question: what is this?" in wrapped
def test_wrap_with_doc_context_multi(populated_backend):
from pageindex.agent import wrap_with_doc_context
docs = populated_backend._scoped_docs("papers", ["d1", "d2"])
wrapped = wrap_with_doc_context(docs, "compare them")
assert "d1: alpha.pdf — About alpha." in wrapped
assert "d2: beta.pdf — About beta." in wrapped
assert "specified the following documents" in wrapped
assert "<docs>" in wrapped and "</docs>" in wrapped
assert "User question: compare them" in wrapped
def test_scoped_docs_raises_on_missing(populated_backend):
with pytest.raises(DocumentNotFoundError, match="nonexistent"):
populated_backend._scoped_docs("papers", ["d1", "nonexistent"])
def test_normalize_doc_ids():
assert LocalBackend._normalize_doc_ids("d1") == ["d1"]
assert LocalBackend._normalize_doc_ids(["d1", "d2"]) == ["d1", "d2"]
assert LocalBackend._normalize_doc_ids(None) is None
def test_normalize_doc_ids_rejects_empty_list():
with pytest.raises(ValueError, match="cannot be empty"):
LocalBackend._normalize_doc_ids([])