feat(collection): scoped query mode and experimental multi-doc warning

- get_agent_tools branches on doc_ids:
  - scoped (doc_ids=[...]): drops list_documents and hard-enforces a
    whitelist on the remaining tools; system prompt switches to
    SCOPED_SYSTEM_PROMPT (no list_documents instruction); doc list +
    summaries are prepended to the user message via wrap_with_doc_context.
  - open (doc_ids=None): unchanged 4-tool agent loop.
- list_documents now exposes doc_description (sqlite + cloud).
- Collection.query emits UserWarning when doc_ids is None and the
  collection holds >1 documents; PAGEINDEX_EXPERIMENTAL_MULTIDOC=1
  silences it. Single-doc collections skip the warning; empty
  collections raise ValueError.
- Agents SDK tracing upload disabled by default (avoids SSL timeouts);
  PAGEINDEX_AGENTS_TRACING=1 re-enables it.
- README: new SDK Usage section covering local/cloud quick start,
  streaming, multi-doc as experimental, and runnable examples.
This commit is contained in:
mountain 2026-05-15 11:14:12 +08:00
parent cbea31d1a2
commit d7b36aaf3f
8 changed files with 348 additions and 25 deletions

View file

@ -1,14 +1,25 @@
# pageindex/agent.py
from __future__ import annotations
import os
from typing import AsyncIterator
from .events import QueryEvent
from .backend.protocol import AgentTools
# The Agents SDK uploads traces to OpenAI's tracing endpoint by default, and
# that upload can fail with SSL timeouts in restricted networks. Tracing is
# therefore switched off at import time unless the user opts back in with
# PAGEINDEX_AGENTS_TRACING=1 (also accepts "true" / "yes", case-insensitive).
if os.environ.get("PAGEINDEX_AGENTS_TRACING", "").lower() not in ("1", "true", "yes"):
    try:
        from agents import set_tracing_disabled
        set_tracing_disabled(True)
    except ImportError:
        # The import failed, so there is nothing to disable.
        pass
SYSTEM_PROMPT = """
OPEN_SYSTEM_PROMPT = """
You are PageIndex, a document QA assistant.
TOOL USE:
- Call list_documents() to see available documents.
- Call list_documents() to see available documents; use doc_name and doc_description to pick which doc(s) are relevant.
- Call get_document(doc_id) to confirm status and page/line count.
- Call get_document_structure(doc_id) to identify relevant page ranges.
- Call get_page_content(doc_id, pages="5-7") with tight ranges; never fetch the whole document.
@ -19,6 +30,42 @@ IMAGES:
Answer based only on tool output. Be concise.
"""
# System prompt for scoped queries. Unlike OPEN_SYSTEM_PROMPT it contains no
# list_documents() instruction; it only describes get_document,
# get_document_structure and get_page_content.
SCOPED_SYSTEM_PROMPT = """
You are PageIndex, a document QA assistant.
TOOL USE:
- Call get_document(doc_id) to confirm status and page/line count.
- Call get_document_structure(doc_id) to identify relevant page ranges.
- Call get_page_content(doc_id, pages="5-7") with tight ranges; never fetch the whole document.
- Before each tool call, output one short sentence explaining the reason.
IMAGES:
- Page content may contain image references like ![image](path). Always preserve these in your answer so the downstream UI can render them.
- Place images near the relevant context in your answer.
Answer based only on tool output. Be concise.
"""
def wrap_with_doc_context(docs: list[dict], question: str) -> str:
    """Prepend a doc-context block to the user question for scoped queries.

    Args:
        docs: Metadata dicts for the documents in scope. Each must contain
            ``doc_id``; ``doc_name`` and ``doc_description`` are optional.
        question: The user's original question.

    Returns:
        The question prefixed with one bullet per document, formatted as
        ``- <doc_id>: <doc_name> — <doc_description>`` (the description part
        is omitted when empty), followed by an instruction to use only the
        listed doc_ids.

    Raises:
        KeyError: If an entry in ``docs`` has no ``doc_id`` key.
    """
    lines = []
    for d in docs:
        # ``or ''`` also covers a doc_name that is present but None, which
        # would otherwise be rendered as the literal text "None".
        line = f"- {d['doc_id']}: {d.get('doc_name') or ''}"
        desc = d.get("doc_description") or ""
        if desc:
            # Separate name and description so they do not run together.
            line += f" — {desc}"
        lines.append(line)
    label = "document" if len(docs) == 1 else "documents"
    return (
        f"The user has specified the following {label}:\n"
        + "\n".join(lines)
        + "\n\nUse the doc_id(s) above directly with get_document_structure() "
        "and get_page_content() — do not look for other documents.\n\n"
        f"User question: {question}"
    )
# Backwards-compatible alias (open mode is the historical default).
SYSTEM_PROMPT = OPEN_SYSTEM_PROMPT
class QueryStream:
"""Streaming query result, similar to OpenAI's RunResultStreaming.
@ -30,12 +77,13 @@ class QueryStream:
print(event.data, end="", flush=True)
"""
def __init__(self, tools: AgentTools, question: str, model: str = None):
def __init__(self, tools: AgentTools, question: str, model: str = None,
instructions: str | None = None):
from agents import Agent
from agents.model_settings import ModelSettings
self._agent = Agent(
name="PageIndex",
instructions=SYSTEM_PROMPT,
instructions=instructions or OPEN_SYSTEM_PROMPT,
tools=tools.function_tools,
mcp_servers=tools.mcp_servers,
model=model,
@ -73,9 +121,11 @@ class QueryStream:
class AgentRunner:
def __init__(self, tools: AgentTools, model: str = None):
def __init__(self, tools: AgentTools, model: str = None,
instructions: str | None = None):
self._tools = tools
self._model = model
self._instructions = instructions or OPEN_SYSTEM_PROMPT
def run(self, question: str) -> str:
"""Sync non-streaming query. Returns answer string."""
@ -83,7 +133,7 @@ class AgentRunner:
from agents.model_settings import ModelSettings
agent = Agent(
name="PageIndex",
instructions=SYSTEM_PROMPT,
instructions=self._instructions,
tools=self._tools.function_tools,
mcp_servers=self._tools.mcp_servers,
model=self._model,

View file

@ -216,7 +216,12 @@ class CloudBackend:
params["folder_id"] = folder_id
data = self._request("GET", "/docs/", params=params)
return [
{"doc_id": d.get("id", ""), "doc_name": d.get("name", ""), "doc_type": "pdf"}
{
"doc_id": d.get("id", ""),
"doc_name": d.get("name", ""),
"doc_description": d.get("description", ""),
"doc_type": "pdf",
}
for d in data.get("documents", [])
]

View file

@ -197,49 +197,95 @@ class LocalBackend:
self._storage.delete_document(collection, doc_id)
def get_agent_tools(self, collection: str, doc_ids: list[str] | None = None) -> AgentTools:
    """Build agent tools.

    - doc_ids=None (open mode): includes ``list_documents``; agent picks docs itself.
    - doc_ids=[...] (scoped mode): no ``list_documents``; the other tools
      hard-enforce the whitelist and reject out-of-scope doc_ids.

    An empty ``doc_ids`` list is treated like ``None`` (open mode). Every tool
    returns a JSON string; an out-of-scope request returns a JSON object with
    ``error`` and ``allowed_doc_ids`` instead of raising.
    """
    from agents import function_tool
    import json

    storage = self._storage
    col_name = collection
    backend = self
    # None means "no whitelist"; otherwise a set for cheap membership tests.
    scope = set(doc_ids) if doc_ids else None

    # Returns a JSON error payload when doc_id is outside the whitelist,
    # or None when the call may proceed.
    def _reject(doc_id: str) -> str | None:
        if scope is not None and doc_id not in scope:
            return json.dumps({
                "error": f"doc_id '{doc_id}' is not in scope.",
                "allowed_doc_ids": sorted(scope),
            }, ensure_ascii=False)
        return None

    # ensure_ascii=False is used for every tool so non-ASCII names and text
    # are emitted as-is rather than as \uXXXX escapes.
    @function_tool
    def get_document(doc_id: str) -> str:
        """Get document metadata."""
        rejection = _reject(doc_id)
        if rejection:
            return rejection
        return json.dumps(storage.get_document(col_name, doc_id), ensure_ascii=False)

    @function_tool
    def get_document_structure(doc_id: str) -> str:
        """Get document tree structure (without text)."""
        rejection = _reject(doc_id)
        if rejection:
            return rejection
        structure = storage.get_document_structure(col_name, doc_id)
        return json.dumps(remove_fields(structure, fields=["text"]), ensure_ascii=False)

    @function_tool
    def get_page_content(doc_id: str, pages: str) -> str:
        """Get page content. Use tight ranges: '5-7', '3,8', '12'."""
        rejection = _reject(doc_id)
        if rejection:
            return rejection
        result = backend.get_page_content(col_name, doc_id, pages)
        return json.dumps(result, ensure_ascii=False)

    tools = [get_document, get_document_structure, get_page_content]
    if scope is None:
        # Only open mode exposes document discovery.
        @function_tool
        def list_documents() -> str:
            """List all documents in the collection."""
            return json.dumps(storage.list_documents(col_name), ensure_ascii=False)

        tools.insert(0, list_documents)
    return AgentTools(function_tools=tools)
def _scoped_docs(self, collection: str, doc_ids: list[str]) -> list[dict]:
    """Return metadata for each requested doc_id, in the order requested.

    Looks the ids up in the collection's document listing and raises
    DocumentNotFoundError naming every id that is not present.
    """
    catalog = {}
    for entry in self._storage.list_documents(collection):
        catalog[entry["doc_id"]] = entry

    missing = [wanted for wanted in doc_ids if wanted not in catalog]
    if missing:
        raise DocumentNotFoundError(
            f"doc_ids not found in collection '{collection}': {missing}"
        )

    return [catalog[wanted] for wanted in doc_ids]
def query(self, collection: str, question: str, doc_ids: list[str] | None = None) -> str:
    """Run a non-streaming query and return the answer string.

    With a non-empty ``doc_ids`` (scoped mode) the question is wrapped with
    the scoped documents' metadata and SCOPED_SYSTEM_PROMPT is used. Otherwise
    ``instructions`` stays None and AgentRunner applies its own default.

    Raises:
        DocumentNotFoundError: In scoped mode, if any doc_id is not in the
            collection (raised by ``_scoped_docs``).
    """
    from ..agent import AgentRunner, SCOPED_SYSTEM_PROMPT, wrap_with_doc_context

    tools = self.get_agent_tools(collection, doc_ids)
    instructions = None
    if doc_ids:
        docs = self._scoped_docs(collection, doc_ids)
        question = wrap_with_doc_context(docs, question)
        instructions = SCOPED_SYSTEM_PROMPT
    runner = AgentRunner(tools=tools, model=self._retrieve_model,
                         instructions=instructions)
    return runner.run(question)
async def query_stream(self, collection: str, question: str,
                       doc_ids: list[str] | None = None):
    """Run a streaming query, yielding events from the agent's QueryStream.

    With a non-empty ``doc_ids`` (scoped mode) the question is wrapped with
    the scoped documents' metadata and SCOPED_SYSTEM_PROMPT is used. Otherwise
    ``instructions`` stays None and QueryStream applies its own default.

    Raises:
        DocumentNotFoundError: In scoped mode, if any doc_id is not in the
            collection (raised by ``_scoped_docs``).
    """
    from ..agent import QueryStream, SCOPED_SYSTEM_PROMPT, wrap_with_doc_context

    tools = self.get_agent_tools(collection, doc_ids)
    instructions = None
    if doc_ids:
        docs = self._scoped_docs(collection, doc_ids)
        question = wrap_with_doc_context(docs, question)
        instructions = SCOPED_SYSTEM_PROMPT
    stream = QueryStream(tools=tools, question=question,
                         model=self._retrieve_model, instructions=instructions)
    async for event in stream:
        yield event

View file

@ -1,10 +1,24 @@
# pageindex/collection.py
from __future__ import annotations
import os
import warnings
from typing import AsyncIterator
from .events import QueryEvent
from .backend.protocol import Backend
def _multidoc_acked() -> bool:
    """Report whether PAGEINDEX_EXPERIMENTAL_MULTIDOC is set to 1/true/yes (any case)."""
    flag = os.environ.get("PAGEINDEX_EXPERIMENTAL_MULTIDOC", "")
    return flag.lower() in ("1", "true", "yes")
# Text of the UserWarning emitted when a query is made without doc_ids.
_MULTIDOC_WARNING = (
"Querying the entire collection (no doc_ids) is experimental — selection "
"accuracy depends on auto-generated doc descriptions. Pass doc_ids=[...] "
"for reliable results, or set PAGEINDEX_EXPERIMENTAL_MULTIDOC=1 to silence "
"this warning."
)
class QueryStream:
"""Wraps backend.query_stream() as an async iterable object."""
@ -60,10 +74,23 @@ class Collection:
- stream=True: returns async iterable of QueryEvent
Usage:
answer = col.query("question")
async for event in col.query("question", stream=True):
answer = col.query("question", doc_ids=[doc_id])
async for event in col.query("question", doc_ids=[doc_id], stream=True):
...
Passing doc_ids=None queries the entire collection. This is
experimental: for collections holding more than one document a
UserWarning is emitted unless PAGEINDEX_EXPERIMENTAL_MULTIDOC is set.
"""
if doc_ids is None and not _multidoc_acked():
docs = self._backend.list_documents(self._name)
if not docs:
raise ValueError(
f"Cannot query collection '{self._name}': it is empty. "
"Add documents with col.add(...) first."
)
if len(docs) > 1:
warnings.warn(_MULTIDOC_WARNING, UserWarning, stacklevel=2)
if stream:
return QueryStream(self._backend, self._name, question, doc_ids)
return self._backend.query(self._name, question, doc_ids)

View file

@ -125,10 +125,10 @@ class SQLiteStorage:
def list_documents(self, collection: str) -> list[dict]:
    """List the documents of a collection, ordered by ``created_at``.

    Args:
        collection: Name matched against ``documents.collection_name``.

    Returns:
        One dict per row with keys ``doc_id``, ``doc_name``,
        ``doc_description`` and ``doc_type``. A NULL description is
        returned as an empty string.
    """
    conn = self._get_conn()
    rows = conn.execute(
        "SELECT doc_id, doc_name, doc_description, doc_type "
        "FROM documents WHERE collection_name = ? ORDER BY created_at",
        (collection,),
    ).fetchall()
    return [
        {
            "doc_id": doc_id,
            "doc_name": doc_name,
            "doc_description": doc_description or "",
            "doc_type": doc_type,
        }
        for doc_id, doc_name, doc_description, doc_type in rows
    ]
def delete_document(self, collection: str, doc_id: str) -> None:
conn = self._get_conn()