mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-19 18:35:16 +02:00
feat(collection): scoped query mode and experimental multi-doc warning
- get_agent_tools branches on doc_ids:
  - scoped (doc_ids=[...]): drops list_documents and hard-enforces a
    whitelist on the remaining tools; system prompt switches to
    SCOPED_SYSTEM_PROMPT (no list_documents instruction); doc list +
    summaries are prepended to the user message via wrap_with_doc_context.
  - open (doc_ids=None): unchanged 4-tool agent loop.
- list_documents now exposes doc_description (sqlite + cloud).
- Collection.query emits a UserWarning when doc_ids is None and the
  collection holds more than one document; PAGEINDEX_EXPERIMENTAL_MULTIDOC=1
  silences it. Single-doc collections skip the warning; empty
  collections raise ValueError.
- Agents SDK tracing upload disabled by default (avoids SSL timeouts);
PAGEINDEX_AGENTS_TRACING=1 re-enables it.
- README: new SDK Usage section covering local/cloud quick start,
streaming, multi-doc as experimental, and runnable examples.
This commit is contained in:
parent
cbea31d1a2
commit
d7b36aaf3f
8 changed files with 348 additions and 25 deletions
|
|
@ -39,3 +39,46 @@ def test_delete_document(col):
|
|||
|
||||
def test_name_property(col):
    """The ``name`` property of the ``col`` fixture reads ``"papers"``."""
    expected_name = "papers"
    assert col.name == expected_name
|
||||
|
||||
|
||||
def test_query_without_doc_ids_warns_when_multidoc(col, monkeypatch):
    """An unscoped query over two listed documents emits the "experimental" UserWarning.

    The backend's answer must still be returned unchanged.
    """
    # Ensure the opt-in variable is absent for this test.
    monkeypatch.delenv("PAGEINDEX_EXPERIMENTAL_MULTIDOC", raising=False)

    two_docs = [
        {"doc_id": doc_id, "doc_name": doc_name, "doc_type": "pdf"}
        for doc_id, doc_name in (("d1", "a.pdf"), ("d2", "b.pdf"))
    ]
    col._backend.list_documents.return_value = two_docs
    col._backend.query.return_value = "answer"

    with pytest.warns(UserWarning, match="experimental"):
        answer = col.query("what?")

    assert answer == "answer"
|
||||
|
||||
|
||||
def test_query_without_doc_ids_no_warning_when_single_doc(col, monkeypatch, recwarn):
    """An unscoped query records no UserWarning with the fixture's default listing.

    NOTE(review): ``list_documents`` is not overridden here, so this presumably
    relies on the ``col`` fixture listing a single document -- confirm against
    the fixture definition.
    """
    monkeypatch.delenv("PAGEINDEX_EXPERIMENTAL_MULTIDOC", raising=False)
    col._backend.query.return_value = "answer"

    col.query("what?")

    user_warnings = [w for w in recwarn if issubclass(w.category, UserWarning)]
    assert not user_warnings
|
||||
|
||||
|
||||
def test_query_empty_collection_raises(col, monkeypatch):
    """Querying when the backend lists no documents raises ValueError matching "empty"."""
    monkeypatch.delenv("PAGEINDEX_EXPERIMENTAL_MULTIDOC", raising=False)

    no_documents = []
    col._backend.list_documents.return_value = no_documents

    with pytest.raises(ValueError, match="empty"):
        col.query("what?")
|
||||
|
||||
|
||||
def test_query_with_doc_ids_no_warning(col, recwarn):
    """A query scoped with explicit ``doc_ids`` records no UserWarning."""
    col._backend.query.return_value = "answer"

    col.query("what?", doc_ids=["d1"])

    user_warnings = [w for w in recwarn if issubclass(w.category, UserWarning)]
    assert not user_warnings
|
||||
|
||||
|
||||
def test_query_env_var_silences_warning(col, monkeypatch, recwarn):
    """With PAGEINDEX_EXPERIMENTAL_MULTIDOC=1, an unscoped two-document query records no UserWarning."""
    monkeypatch.setenv("PAGEINDEX_EXPERIMENTAL_MULTIDOC", "1")

    two_docs = [
        {"doc_id": doc_id, "doc_name": doc_name, "doc_type": "pdf"}
        for doc_id, doc_name in (("d1", "a.pdf"), ("d2", "b.pdf"))
    ]
    col._backend.list_documents.return_value = two_docs
    col._backend.query.return_value = "answer"

    col.query("what?")

    user_warnings = [w for w in recwarn if issubclass(w.category, UserWarning)]
    assert not user_warnings
|
||||
|
|
|
|||
|
|
@ -1,9 +1,11 @@
|
|||
# tests/sdk/test_local_backend.py
|
||||
import asyncio
|
||||
import json
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from pageindex.backend.local import LocalBackend
|
||||
from pageindex.storage.sqlite import SQLiteStorage
|
||||
from pageindex.errors import FileTypeError
|
||||
from pageindex.errors import FileTypeError, DocumentNotFoundError
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
@ -48,3 +50,81 @@ def test_register_custom_parser(backend):
|
|||
backend.register_parser(TxtParser())
|
||||
# Now .txt should be supported (won't raise FileTypeError)
|
||||
assert backend._resolve_parser("test.txt") is not None
|
||||
|
||||
|
||||
# ── Scoped-mode agent tools ──────────────────────────────────────────────────
|
||||
|
||||
@pytest.fixture
def populated_backend(backend):
    """Return ``backend`` after saving two stub PDF documents into a 'papers' collection.

    The documents are stored directly through ``backend._storage`` with ids
    ``d1`` and ``d2`` and an empty ``structure``.
    """
    backend.get_or_create_collection("papers")

    stub_docs = (
        ("d1", "alpha.pdf", "About alpha."),
        ("d2", "beta.pdf", "About beta."),
    )
    for doc_id, file_name, summary in stub_docs:
        record = {
            "doc_name": file_name,
            "doc_description": summary,
            "doc_type": "pdf",
            "file_path": f"/tmp/{file_name}",
            "structure": [],
        }
        backend._storage.save_document("papers", doc_id, record)

    return backend
|
||||
|
||||
|
||||
def _invoke_tool(tool, args: dict) -> str:
    """Invoke a FunctionTool synchronously and return its result.

    ``args`` is JSON-encoded and passed both as the ToolContext's
    ``tool_arguments`` and as the payload to ``tool.on_invoke_tool``, which is
    run to completion with ``asyncio.run``.
    """
    from agents.tool_context import ToolContext

    payload = json.dumps(args)
    tool_ctx = ToolContext(
        context=None,
        tool_name=tool.name,
        tool_call_id="test",
        tool_arguments=payload,
    )
    return asyncio.run(tool.on_invoke_tool(tool_ctx, payload))
|
||||
|
||||
|
||||
def test_open_mode_includes_list_documents(populated_backend):
    """With ``doc_ids=None`` the agent tool set is exactly the four tools, including ``list_documents``."""
    toolset = populated_backend.get_agent_tools("papers", doc_ids=None)
    tool_names = set()
    for tool in toolset.function_tools:
        tool_names.add(tool.name)
    expected = {"list_documents", "get_document", "get_document_structure", "get_page_content"}
    assert tool_names == expected
|
||||
|
||||
|
||||
def test_scoped_mode_excludes_list_documents(populated_backend):
    """With explicit ``doc_ids`` the tool set omits ``list_documents`` and keeps the other three tools."""
    toolset = populated_backend.get_agent_tools("papers", doc_ids=["d1"])
    tool_names = set()
    for tool in toolset.function_tools:
        tool_names.add(tool.name)
    assert "list_documents" not in tool_names
    expected = {"get_document", "get_document_structure", "get_page_content"}
    assert tool_names == expected
|
||||
|
||||
|
||||
def test_scoped_mode_rejects_out_of_scope_doc_id(populated_backend):
    """In a scope of ``["d1"]``, ``get_document`` for ``d2`` returns a "not in scope" error payload."""
    toolset = populated_backend.get_agent_tools("papers", doc_ids=["d1"])
    tools_by_name = {}
    for tool in toolset.function_tools:
        tools_by_name[tool.name] = tool

    raw_reply = _invoke_tool(tools_by_name["get_document"], {"doc_id": "d2"})
    reply = json.loads(raw_reply)

    assert "error" in reply
    assert "not in scope" in reply["error"]
    # The error payload also reports which ids the caller may use.
    assert reply["allowed_doc_ids"] == ["d1"]
|
||||
|
||||
|
||||
def test_scoped_mode_allows_in_scope_doc_id(populated_backend):
    """In a scope of ``["d1"]``, ``get_document`` for ``d1`` returns JSON naming ``alpha.pdf``."""
    toolset = populated_backend.get_agent_tools("papers", doc_ids=["d1"])
    tools_by_name = {}
    for tool in toolset.function_tools:
        tools_by_name[tool.name] = tool

    raw_reply = _invoke_tool(tools_by_name["get_document"], {"doc_id": "d1"})
    reply = json.loads(raw_reply)

    assert reply.get("doc_name") == "alpha.pdf"
|
||||
|
||||
|
||||
def test_wrap_with_doc_context_single(populated_backend):
    """For one scoped document the wrapped message uses singular wording, lists the doc, and carries the question."""
    from pageindex.agent import wrap_with_doc_context

    scoped = populated_backend._scoped_docs("papers", ["d1"])
    message = wrap_with_doc_context(scoped, "what is this?")

    expected_fragments = (
        "d1: alpha.pdf — About alpha.",
        "specified the following document:",
        "User question: what is this?",
    )
    for fragment in expected_fragments:
        assert fragment in message
|
||||
|
||||
|
||||
def test_wrap_with_doc_context_multi(populated_backend):
    """For two scoped documents the wrapped message uses plural wording, lists both docs, and carries the question."""
    from pageindex.agent import wrap_with_doc_context

    scoped = populated_backend._scoped_docs("papers", ["d1", "d2"])
    message = wrap_with_doc_context(scoped, "compare them")

    expected_fragments = (
        "d1: alpha.pdf — About alpha.",
        "d2: beta.pdf — About beta.",
        "specified the following documents:",
        "User question: compare them",
    )
    for fragment in expected_fragments:
        assert fragment in message
|
||||
|
||||
|
||||
def test_scoped_docs_raises_on_missing(populated_backend):
    """``_scoped_docs`` raises DocumentNotFoundError mentioning the id that is not stored."""
    requested_ids = ["d1", "nonexistent"]
    with pytest.raises(DocumentNotFoundError, match="nonexistent"):
        populated_backend._scoped_docs("papers", requested_ids)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue