feat(collection): scoped query mode and experimental multi-doc warning

- get_agent_tools branches on doc_ids:
  - scoped (doc_ids=[...]): drops list_documents and hard-enforces a
    whitelist on the remaining tools; system prompt switches to
    SCOPED_SYSTEM_PROMPT (no list_documents instruction); doc list +
    summaries are prepended to the user message via wrap_with_doc_context.
  - open (doc_ids=None): unchanged 4-tool agent loop.
- list_documents now exposes doc_description (sqlite + cloud).
- Collection.query emits UserWarning when doc_ids is None and the
  collection holds >1 documents; PAGEINDEX_EXPERIMENTAL_MULTIDOC=1
  silences it. Single-doc collections skip the warning; empty
  collections raise ValueError.
- Agents SDK tracing upload disabled by default (avoids SSL timeouts);
  PAGEINDEX_AGENTS_TRACING=1 re-enables it.
- README: new SDK Usage section covering local/cloud quick start,
  streaming, multi-doc as experimental, and runnable examples.
This commit is contained in:
mountain 2026-05-15 11:14:12 +08:00
parent cbea31d1a2
commit d7b36aaf3f
8 changed files with 348 additions and 25 deletions

View file

@ -1,10 +1,24 @@
# pageindex/collection.py
from __future__ import annotations
import os
import warnings
from typing import AsyncIterator
from .events import QueryEvent
from .backend.protocol import Backend
def _multidoc_acked() -> bool:
    """Return True when whole-collection querying has been acknowledged.

    Reads the ``PAGEINDEX_EXPERIMENTAL_MULTIDOC`` environment variable on
    every call, so changes made at runtime are picked up. The values ``1``,
    ``true`` and ``yes`` (case-insensitive) count as an acknowledgement;
    an unset, empty, or any other value does not.

    Surrounding whitespace is ignored: values coming from .env files or
    shell exports often carry a trailing space or newline, and should not
    silently fail to acknowledge.
    """
    raw_flag = os.getenv("PAGEINDEX_EXPERIMENTAL_MULTIDOC", "")
    return raw_flag.strip().lower() in ("1", "true", "yes")
# Message passed to warnings.warn() when a query is issued without doc_ids.
# It tells the caller how to get reliable results (pass doc_ids) and how to
# silence the warning (set PAGEINDEX_EXPERIMENTAL_MULTIDOC=1).
_MULTIDOC_WARNING = (
    "Querying the entire collection (no doc_ids) is experimental — "
    "selection accuracy depends on auto-generated doc descriptions. "
    "Pass doc_ids=[...] for reliable results, or set "
    "PAGEINDEX_EXPERIMENTAL_MULTIDOC=1 to silence this warning."
)
class QueryStream:
"""Wraps backend.query_stream() as an async iterable object."""
@ -60,10 +74,23 @@ class Collection:
- stream=True: returns async iterable of QueryEvent
Usage:
answer = col.query("question")
async for event in col.query("question", stream=True):
answer = col.query("question", doc_ids=[doc_id])
async for event in col.query("question", doc_ids=[doc_id], stream=True):
...
Passing doc_ids=None queries the entire collection — this is
experimental; emits a UserWarning unless PAGEINDEX_EXPERIMENTAL_MULTIDOC
is set.
"""
if doc_ids is None and not _multidoc_acked():
docs = self._backend.list_documents(self._name)
if not docs:
raise ValueError(
f"Cannot query collection '{self._name}': it is empty. "
"Add documents with col.add(...) first."
)
if len(docs) > 1:
warnings.warn(_MULTIDOC_WARNING, UserWarning, stacklevel=2)
if stream:
return QueryStream(self._backend, self._name, question, doc_ids)
return self._backend.query(self._name, question, doc_ids)