mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-19 18:35:16 +02:00
- Collection.query and Backend.query/query_stream accept doc_ids as str, list[str] or None. Single str is normalized to [str] inside each backend; bare [] is rejected with ValueError at both layers.
- wrap_with_doc_context wraps the scoped doc list in <docs>...</docs> and SCOPED_SYSTEM_PROMPT instructs the agent to treat that block as data, not instructions (defense against prompt injection via auto-generated doc_description).
- _require_cloud_api now distinguishes api_key="" from api_key=None; the former gives a targeted error pointing at the empty-string vs fall-back-to-local situation when legacy SDK methods are called.
- Legacy PageIndexClient.list_documents docstring spells out the return-shape difference vs collection.list_documents() to flag a silent migration footgun (paginated dict with id/name keys vs plain list[dict] with doc_id/doc_name keys).
- Remove dead CloudBackend.get_agent_tools stub (not on the Backend protocol; only ever returned an empty AgentTools()) and the SYSTEM_PROMPT alias (OPEN_/SCOPED_SYSTEM_PROMPT are the explicit names now).
- README quick start and streaming example now pass doc_ids; new multi-document section shows both str and list forms.
- examples/demo_query_modes.py exercises all five query-mode cases (single-doc, multi-doc with/without env var, scoped single, scoped multi) for manual verification.
96 lines
3.1 KiB
Python
96 lines
3.1 KiB
Python
# tests/sdk/test_collection.py
|
|
import pytest
|
|
from unittest.mock import MagicMock
|
|
from pageindex.collection import Collection
|
|
|
|
|
|
@pytest.fixture
def col():
    """Provide a ``Collection`` named "papers" backed by a ``MagicMock``.

    The mock backend is pre-programmed with canned return values: a
    one-document listing, a document record for ``get_document`` and the
    id "d1" for ``add_document``.
    """
    mock_backend = MagicMock()
    mock_backend.add_document.return_value = "d1"
    mock_backend.get_document.return_value = {"doc_id": "d1", "doc_name": "paper.pdf"}
    only_doc = {"doc_id": "d1", "doc_name": "paper.pdf", "doc_type": "pdf"}
    mock_backend.list_documents.return_value = [only_doc]
    return Collection(name="papers", backend=mock_backend)
|
|
|
|
|
|
def test_add(col):
    """``add`` returns the backend's id and forwards the collection name and path."""
    assert col.add("paper.pdf") == "d1"
    add_mock = col._backend.add_document
    add_mock.assert_called_once_with("papers", "paper.pdf")
|
|
|
|
|
|
def test_list_documents(col):
    """``list_documents`` returns the single document configured on the mock backend."""
    listing = col.list_documents()
    assert len(listing) == 1
    first_entry = listing[0]
    assert first_entry["doc_id"] == "d1"
|
|
|
|
|
|
def test_get_document(col):
    """``get_document`` returns the record configured on the mock backend."""
    fetched = col.get_document("d1")
    name = fetched["doc_name"]
    assert name == "paper.pdf"
|
|
|
|
|
|
def test_delete_document(col):
    """``delete_document`` calls the backend once with the collection name and doc id."""
    col.delete_document("d1")
    delete_mock = col._backend.delete_document
    delete_mock.assert_called_once_with("papers", "d1")
|
|
|
|
|
|
def test_name_property(col):
    """``name`` exposes the name the collection was constructed with."""
    collection_name = col.name
    assert collection_name == "papers"
|
|
|
|
|
|
def test_query_without_doc_ids_warns_when_multidoc(col, monkeypatch):
    """Unscoped query over two documents emits an "experimental" ``UserWarning``.

    The PAGEINDEX_EXPERIMENTAL_MULTIDOC variable is removed first so the
    warning is not suppressed; the backend's answer must still be returned.
    """
    monkeypatch.delenv("PAGEINDEX_EXPERIMENTAL_MULTIDOC", raising=False)
    backend = col._backend
    backend.query.return_value = "answer"
    backend.list_documents.return_value = [
        {"doc_id": doc_id, "doc_name": doc_name, "doc_type": "pdf"}
        for doc_id, doc_name in (("d1", "a.pdf"), ("d2", "b.pdf"))
    ]
    with pytest.warns(UserWarning, match="experimental"):
        answer = col.query("what?")
    assert answer == "answer"
|
|
|
|
|
|
def test_query_without_doc_ids_no_warning_when_single_doc(col, monkeypatch, recwarn):
    """Unscoped query over the fixture's single document records no ``UserWarning``."""
    monkeypatch.delenv("PAGEINDEX_EXPERIMENTAL_MULTIDOC", raising=False)
    col._backend.query.return_value = "answer"
    col.query("what?")
    user_warnings = [w for w in recwarn if issubclass(w.category, UserWarning)]
    assert not user_warnings
|
|
|
|
|
|
def test_query_empty_collection_raises(col, monkeypatch):
    """Unscoped query raises ``ValueError`` matching "empty" when no documents are listed."""
    monkeypatch.delenv("PAGEINDEX_EXPERIMENTAL_MULTIDOC", raising=False)
    no_documents = []
    col._backend.list_documents.return_value = no_documents
    with pytest.raises(ValueError, match="empty"):
        col.query("what?")
|
|
|
|
|
|
def test_query_with_doc_ids_no_warning(col, recwarn):
    """Query with an explicit ``doc_ids`` list records no ``UserWarning``."""
    col._backend.query.return_value = "answer"
    scoped_ids = ["d1"]
    col.query("what?", doc_ids=scoped_ids)
    user_warnings = [w for w in recwarn if issubclass(w.category, UserWarning)]
    assert not user_warnings
|
|
|
|
|
|
def test_query_env_var_silences_warning(col, monkeypatch, recwarn):
    """With PAGEINDEX_EXPERIMENTAL_MULTIDOC set to "1", a two-document query does not warn."""
    monkeypatch.setenv("PAGEINDEX_EXPERIMENTAL_MULTIDOC", "1")
    backend = col._backend
    backend.query.return_value = "answer"
    backend.list_documents.return_value = [
        {"doc_id": doc_id, "doc_name": doc_name, "doc_type": "pdf"}
        for doc_id, doc_name in (("d1", "a.pdf"), ("d2", "b.pdf"))
    ]
    col.query("what?")
    user_warnings = [w for w in recwarn if issubclass(w.category, UserWarning)]
    assert not user_warnings
|
|
|
|
|
|
def test_query_accepts_str_doc_id(col):
    """A bare string ``doc_ids`` reaches the backend as a one-element list."""
    query_mock = col._backend.query
    query_mock.return_value = "answer"
    col.query("what?", doc_ids="d1")
    query_mock.assert_called_once_with("papers", "what?", ["d1"])
|
|
|
|
|
|
def test_query_rejects_empty_list(col):
    """An empty ``doc_ids`` list raises ``ValueError`` matching "cannot be empty"."""
    empty_scope = []
    with pytest.raises(ValueError, match="cannot be empty"):
        col.query("what?", doc_ids=empty_scope)
|