feat(collection): scoped query mode and experimental multi-doc warning

- get_agent_tools branches on doc_ids:
  - scoped (doc_ids=[...]): drops list_documents and hard-enforces a
    whitelist on the remaining tools; system prompt switches to
    SCOPED_SYSTEM_PROMPT (no list_documents instruction); doc list +
    summaries are prepended to the user message via wrap_with_doc_context.
  - open (doc_ids=None): unchanged 4-tool agent loop.
- list_documents now exposes doc_description (sqlite + cloud).
- Collection.query emits UserWarning when doc_ids is None and the
  collection holds >1 documents; PAGEINDEX_EXPERIMENTAL_MULTIDOC=1
  silences it. Single-doc collections skip the warning; empty
  collections raise ValueError.
- Agents SDK tracing upload disabled by default (avoids SSL timeouts);
  PAGEINDEX_AGENTS_TRACING=1 re-enables it.
- README: new SDK Usage section covering local/cloud quick start,
  streaming, multi-doc as experimental, and runnable examples.
This commit is contained in:
mountain 2026-05-15 11:14:12 +08:00
parent cbea31d1a2
commit d7b36aaf3f
8 changed files with 348 additions and 25 deletions

View file

@ -216,7 +216,12 @@ class CloudBackend:
params["folder_id"] = folder_id
data = self._request("GET", "/docs/", params=params)
return [
{"doc_id": d.get("id", ""), "doc_name": d.get("name", ""), "doc_type": "pdf"}
{
"doc_id": d.get("id", ""),
"doc_name": d.get("name", ""),
"doc_description": d.get("description", ""),
"doc_type": "pdf",
}
for d in data.get("documents", [])
]

View file

@ -197,49 +197,95 @@ class LocalBackend:
self._storage.delete_document(collection, doc_id)
def get_agent_tools(self, collection: str, doc_ids: list[str] | None = None) -> AgentTools:
"""Build agent tools.
- doc_ids=None (open mode): includes ``list_documents``; agent picks docs itself.
- doc_ids=[...] (scoped mode): no ``list_documents``; the other tools
hard-enforce the whitelist and reject out-of-scope doc_ids.
"""
from agents import function_tool
import json
storage = self._storage
col_name = collection
backend = self
filter_ids = doc_ids
scope = set(doc_ids) if doc_ids else None
@function_tool
def list_documents() -> str:
"""List all documents in the collection."""
docs = storage.list_documents(col_name)
if filter_ids:
docs = [d for d in docs if d["doc_id"] in filter_ids]
return json.dumps(docs)
def _reject(doc_id: str) -> str | None:
if scope is not None and doc_id not in scope:
return json.dumps({
"error": f"doc_id '{doc_id}' is not in scope.",
"allowed_doc_ids": sorted(scope),
})
return None
@function_tool
def get_document(doc_id: str) -> str:
"""Get document metadata."""
rejection = _reject(doc_id)
if rejection:
return rejection
return json.dumps(storage.get_document(col_name, doc_id))
@function_tool
def get_document_structure(doc_id: str) -> str:
"""Get document tree structure (without text)."""
rejection = _reject(doc_id)
if rejection:
return rejection
structure = storage.get_document_structure(col_name, doc_id)
return json.dumps(remove_fields(structure, fields=["text"]), ensure_ascii=False)
@function_tool
def get_page_content(doc_id: str, pages: str) -> str:
"""Get page content. Use tight ranges: '5-7', '3,8', '12'."""
rejection = _reject(doc_id)
if rejection:
return rejection
result = backend.get_page_content(col_name, doc_id, pages)
return json.dumps(result, ensure_ascii=False)
return AgentTools(function_tools=[list_documents, get_document, get_document_structure, get_page_content])
tools = [get_document, get_document_structure, get_page_content]
if scope is None:
@function_tool
def list_documents() -> str:
"""List all documents in the collection."""
return json.dumps(storage.list_documents(col_name))
tools.insert(0, list_documents)
return AgentTools(function_tools=tools)
def _scoped_docs(self, collection: str, doc_ids: list[str]) -> list[dict]:
"""Fetch metadata for the docs in scope; raise if any are missing."""
by_id = {d["doc_id"]: d for d in self._storage.list_documents(collection)}
missing = [did for did in doc_ids if did not in by_id]
if missing:
raise DocumentNotFoundError(
f"doc_ids not found in collection '{collection}': {missing}"
)
return [by_id[did] for did in doc_ids]
def query(self, collection: str, question: str, doc_ids: list[str] | None = None) -> str:
from ..agent import AgentRunner
from ..agent import AgentRunner, SCOPED_SYSTEM_PROMPT, wrap_with_doc_context
tools = self.get_agent_tools(collection, doc_ids)
return AgentRunner(tools=tools, model=self._retrieve_model).run(question)
instructions = None
if doc_ids:
docs = self._scoped_docs(collection, doc_ids)
question = wrap_with_doc_context(docs, question)
instructions = SCOPED_SYSTEM_PROMPT
return AgentRunner(tools=tools, model=self._retrieve_model,
instructions=instructions).run(question)
async def query_stream(self, collection: str, question: str,
doc_ids: list[str] | None = None):
from ..agent import QueryStream
from ..agent import QueryStream, SCOPED_SYSTEM_PROMPT, wrap_with_doc_context
tools = self.get_agent_tools(collection, doc_ids)
stream = QueryStream(tools=tools, question=question, model=self._retrieve_model)
instructions = None
if doc_ids:
docs = self._scoped_docs(collection, doc_ids)
question = wrap_with_doc_context(docs, question)
instructions = SCOPED_SYSTEM_PROMPT
stream = QueryStream(tools=tools, question=question,
model=self._retrieve_model, instructions=instructions)
async for event in stream:
yield event