PageIndex/pageindex/collection.py

# pageindex/collection.py
from __future__ import annotations
import os
import warnings
from typing import AsyncIterator
from .events import QueryEvent
from .backend.protocol import Backend


def _multidoc_acked() -> bool:
    """Report whether the experimental multi-document opt-in flag is set.

    Reads ``PAGEINDEX_EXPERIMENTAL_MULTIDOC`` from the environment and treats
    ``1``, ``true`` and ``yes`` (compared case-insensitively) as an
    acknowledgement. An unset variable or any other value counts as not
    acknowledged; the value is not stripped of whitespace.
    """
    flag = os.environ.get("PAGEINDEX_EXPERIMENTAL_MULTIDOC", "")
    return flag.lower() in {"1", "true", "yes"}


# Text of the UserWarning that Collection.query emits for whole-collection
# queries (doc_ids=None) on a collection holding more than one document,
# unless PAGEINDEX_EXPERIMENTAL_MULTIDOC is set to an accepted value.
_MULTIDOC_WARNING = (
    "Querying the entire collection (no doc_ids) is experimental — a naive "
    "first implementation that lets the agent pick docs from auto-generated "
    "descriptions. Better cross-document retrieval is on the way. Pass "
    "doc_ids=[...] for reliable results, or set "
    "PAGEINDEX_EXPERIMENTAL_MULTIDOC=1 to silence this warning."
)


class QueryStream:
    """Async-iterable handle over a single ``backend.query_stream()`` call.

    Construction only records the query arguments. The backend is first
    contacted when iteration starts, and every ``async for`` over the object
    opens a new stream through :meth:`stream_events`.
    """

    def __init__(self, backend: Backend, collection: str, question: str,
                 doc_ids: list[str] | None = None):
        self._backend, self._collection = backend, collection
        self._question, self._doc_ids = question, doc_ids

    async def stream_events(self) -> AsyncIterator[QueryEvent]:
        """Yield each event from the backend's query stream, in order."""
        upstream = self._backend.query_stream(
            self._collection, self._question, self._doc_ids
        )
        async for item in upstream:
            yield item

    def __aiter__(self):
        """Return a fresh event generator so the object works in ``async for``."""
        return self.stream_events()


class Collection:
    """Handle for one named document collection stored in a backend.

    Every method forwards to the backend given at construction time, passing
    this collection's name as the first argument.
    """

    def __init__(self, name: str, backend: Backend):
        self._name = name
        self._backend = backend

    @property
    def name(self) -> str:
        """Name of this collection, as passed to the constructor."""
        return self._name

    def add(self, file_path: str) -> str:
        """Add the file at ``file_path`` and return the backend's result.

        NOTE(review): the returned string is presumably the new document's
        id — confirm against the backend implementation.
        """
        return self._backend.add_document(self._name, file_path)

    def list_documents(self) -> list[dict]:
        """Return the backend's listing of documents in this collection."""
        return self._backend.list_documents(self._name)

    def get_document(self, doc_id: str, include_text: bool = False) -> dict:
        """Return the backend's record for ``doc_id``.

        ``include_text`` is forwarded to the backend unchanged.
        """
        return self._backend.get_document(self._name, doc_id, include_text=include_text)

    def get_document_structure(self, doc_id: str) -> list:
        """Return the backend's structure listing for ``doc_id``."""
        return self._backend.get_document_structure(self._name, doc_id)

    def get_page_content(self, doc_id: str, pages: str) -> list:
        """Return the backend's content for ``pages`` of ``doc_id``.

        ``pages`` is forwarded as-is; its accepted syntax is defined by the
        backend.
        """
        return self._backend.get_page_content(self._name, doc_id, pages)

    def delete_document(self, doc_id: str) -> None:
        """Ask the backend to delete ``doc_id`` from this collection."""
        self._backend.delete_document(self._name, doc_id)

    def query(self, question: str,
              doc_ids: str | list[str] | None = None,
              stream: bool = False) -> str | QueryStream:
        """Query documents in this collection.

        - stream=False: returns answer string (sync)
        - stream=True: returns async iterable of QueryEvent

        ``doc_ids`` can be a single doc id (``str``) or a list. ``None`` queries
        the entire collection (experimental).

        Usage:
            answer = col.query("question", doc_ids=doc_id)            # single
            answer = col.query("question", doc_ids=[d1, d2])          # multi
            async for event in col.query("question", doc_ids=doc_id, stream=True):
                ...

        Passing doc_ids=None queries the entire collection — this is
        experimental; when the collection holds more than one document it
        emits a UserWarning unless PAGEINDEX_EXPERIMENTAL_MULTIDOC is set.

        Raises:
            ValueError: if ``doc_ids`` is an empty list, or if ``doc_ids`` is
                ``None`` and the collection contains no documents. The
                emptiness check applies whether or not
                PAGEINDEX_EXPERIMENTAL_MULTIDOC is set.
        """
        if isinstance(doc_ids, str):
            # A bare id is shorthand for a one-element list.
            doc_ids = [doc_ids]
        elif doc_ids == []:
            raise ValueError(
                "doc_ids cannot be empty; pass None to query the whole collection"
            )
        if doc_ids is None:
            docs = self._backend.list_documents(self._name)
            # The env var only silences the warning below; an empty collection
            # is an error regardless, so check it before consulting the flag.
            if not docs:
                raise ValueError(
                    f"Cannot query collection '{self._name}': it is empty. "
                    "Add documents with col.add(...) first."
                )
            if len(docs) > 1 and not _multidoc_acked():
                # stacklevel=2 attributes the warning to the caller of query().
                warnings.warn(_MULTIDOC_WARNING, UserWarning, stacklevel=2)
        if stream:
            return QueryStream(self._backend, self._name, question, doc_ids)
        return self._backend.query(self._name, question, doc_ids)
feat: add PageIndex SDK with local/cloud dual-mode support (#207) 2026-04-06 22:51:04 +08:00			`# pageindex/collection.py`
			`from __future__ import annotations`
feat(collection): scoped query mode and experimental multi-doc warning - get_agent_tools branches on doc_ids: - scoped (doc_ids=[...]): drops list_documents and hard-enforces a whitelist on the remaining tools; system prompt switches to SCOPED_SYSTEM_PROMPT (no list_documents instruction); doc list + summaries are prepended to the user message via wrap_with_doc_context. - open (doc_ids=None): unchanged 4-tool agent loop. - list_documents now exposes doc_description (sqlite + cloud). - Collection.query emits UserWarning when doc_ids is None and the collection holds >1 documents; PAGEINDEX_EXPERIMENTAL_MULTIDOC=1 silences it. Single-doc collections skip the warning; empty collections raise ValueError. - Agents SDK tracing upload disabled by default (avoids SSL timeouts); PAGEINDEX_AGENTS_TRACING=1 re-enables it. - README: new SDK Usage section covering local/cloud quick start, streaming, multi-doc as experimental, and runnable examples. 2026-05-15 11:14:12 +08:00			`import os`
			`import warnings`
feat: add PageIndex SDK with local/cloud dual-mode support (#207) 2026-04-06 22:51:04 +08:00			`from typing import AsyncIterator`
			`from .events import QueryEvent`
			`from .backend.protocol import Backend`


feat(collection): scoped query mode and experimental multi-doc warning - get_agent_tools branches on doc_ids: - scoped (doc_ids=[...]): drops list_documents and hard-enforces a whitelist on the remaining tools; system prompt switches to SCOPED_SYSTEM_PROMPT (no list_documents instruction); doc list + summaries are prepended to the user message via wrap_with_doc_context. - open (doc_ids=None): unchanged 4-tool agent loop. - list_documents now exposes doc_description (sqlite + cloud). - Collection.query emits UserWarning when doc_ids is None and the collection holds >1 documents; PAGEINDEX_EXPERIMENTAL_MULTIDOC=1 silences it. Single-doc collections skip the warning; empty collections raise ValueError. - Agents SDK tracing upload disabled by default (avoids SSL timeouts); PAGEINDEX_AGENTS_TRACING=1 re-enables it. - README: new SDK Usage section covering local/cloud quick start, streaming, multi-doc as experimental, and runnable examples. 2026-05-15 11:14:12 +08:00			`def _multidoc_acked() -> bool:`
			`return os.getenv("PAGEINDEX_EXPERIMENTAL_MULTIDOC", "").lower() in ("1", "true", "yes")`


			`_MULTIDOC_WARNING = (`
feat(collection): doc_ids accepts str\|list, design cleanups - Collection.query and Backend.query/query_stream accept doc_ids as str, list[str] or None. Single str is normalized to [str] inside each backend; bare [] is rejected with ValueError at both layers. - wrap_with_doc_context wraps the scoped doc list in <docs>...</docs> and SCOPED_SYSTEM_PROMPT instructs the agent to treat that block as data, not instructions (defense against prompt injection via auto-generated doc_description). - _require_cloud_api now distinguishes api_key="" from api_key=None; the former gives a targeted error pointing at the empty-string vs fall-back-to-local situation when legacy SDK methods are called. - Legacy PageIndexClient.list_documents docstring spells out the return-shape difference vs collection.list_documents() to flag a silent migration footgun (paginated dict with id/name keys vs plain list[dict] with doc_id/doc_name keys). - Remove dead CloudBackend.get_agent_tools stub (not on the Backend protocol; only ever returned an empty AgentTools()) and the SYSTEM_PROMPT alias (OPEN_/SCOPED_SYSTEM_PROMPT are the explicit names now). - README quick start and streaming example now pass doc_ids; new multi-document section shows both str and list forms. - examples/demo_query_modes.py exercises all five query-mode cases (single-doc, multi-doc with/without env var, scoped single, scoped multi) for manual verification. 2026-05-15 17:03:17 +08:00			`"Querying the entire collection (no doc_ids) is experimental — a naive "`
			`"first implementation that lets the agent pick docs from auto-generated "`
			`"descriptions. Better cross-document retrieval is on the way. Pass "`
			`"doc_ids=[...] for reliable results, or set "`
			`"PAGEINDEX_EXPERIMENTAL_MULTIDOC=1 to silence this warning."`
feat(collection): scoped query mode and experimental multi-doc warning - get_agent_tools branches on doc_ids: - scoped (doc_ids=[...]): drops list_documents and hard-enforces a whitelist on the remaining tools; system prompt switches to SCOPED_SYSTEM_PROMPT (no list_documents instruction); doc list + summaries are prepended to the user message via wrap_with_doc_context. - open (doc_ids=None): unchanged 4-tool agent loop. - list_documents now exposes doc_description (sqlite + cloud). - Collection.query emits UserWarning when doc_ids is None and the collection holds >1 documents; PAGEINDEX_EXPERIMENTAL_MULTIDOC=1 silences it. Single-doc collections skip the warning; empty collections raise ValueError. - Agents SDK tracing upload disabled by default (avoids SSL timeouts); PAGEINDEX_AGENTS_TRACING=1 re-enables it. - README: new SDK Usage section covering local/cloud quick start, streaming, multi-doc as experimental, and runnable examples. 2026-05-15 11:14:12 +08:00			`)`


feat: add PageIndex SDK with local/cloud dual-mode support (#207) 2026-04-06 22:51:04 +08:00			`class QueryStream:`
			`"""Wraps backend.query_stream() as an async iterable object."""`

			`def __init__(self, backend: Backend, collection: str, question: str,`
			`doc_ids: list[str] \| None = None):`
			`self._backend = backend`
			`self._collection = collection`
			`self._question = question`
			`self._doc_ids = doc_ids`

			`async def stream_events(self) -> AsyncIterator[QueryEvent]:`
			`async for event in self._backend.query_stream(`
			`self._collection, self._question, self._doc_ids`
			`):`
			`yield event`

			`def __aiter__(self):`
			`return self.stream_events()`


			`class Collection:`
			`def __init__(self, name: str, backend: Backend):`
			`self._name = name`
			`self._backend = backend`

			`@property`
			`def name(self) -> str:`
			`return self._name`

			`def add(self, file_path: str) -> str:`
			`return self._backend.add_document(self._name, file_path)`

			`def list_documents(self) -> list[dict]:`
			`return self._backend.list_documents(self._name)`

			`def get_document(self, doc_id: str, include_text: bool = False) -> dict:`
			`return self._backend.get_document(self._name, doc_id, include_text=include_text)`

			`def get_document_structure(self, doc_id: str) -> list:`
			`return self._backend.get_document_structure(self._name, doc_id)`

			`def get_page_content(self, doc_id: str, pages: str) -> list:`
			`return self._backend.get_page_content(self._name, doc_id, pages)`

			`def delete_document(self, doc_id: str) -> None:`
			`self._backend.delete_document(self._name, doc_id)`

feat(collection): doc_ids accepts str\|list, design cleanups - Collection.query and Backend.query/query_stream accept doc_ids as str, list[str] or None. Single str is normalized to [str] inside each backend; bare [] is rejected with ValueError at both layers. - wrap_with_doc_context wraps the scoped doc list in <docs>...</docs> and SCOPED_SYSTEM_PROMPT instructs the agent to treat that block as data, not instructions (defense against prompt injection via auto-generated doc_description). - _require_cloud_api now distinguishes api_key="" from api_key=None; the former gives a targeted error pointing at the empty-string vs fall-back-to-local situation when legacy SDK methods are called. - Legacy PageIndexClient.list_documents docstring spells out the return-shape difference vs collection.list_documents() to flag a silent migration footgun (paginated dict with id/name keys vs plain list[dict] with doc_id/doc_name keys). - Remove dead CloudBackend.get_agent_tools stub (not on the Backend protocol; only ever returned an empty AgentTools()) and the SYSTEM_PROMPT alias (OPEN_/SCOPED_SYSTEM_PROMPT are the explicit names now). - README quick start and streaming example now pass doc_ids; new multi-document section shows both str and list forms. - examples/demo_query_modes.py exercises all five query-mode cases (single-doc, multi-doc with/without env var, scoped single, scoped multi) for manual verification. 2026-05-15 17:03:17 +08:00			`def query(self, question: str,`
			`doc_ids: str \| list[str] \| None = None,`
feat: add PageIndex SDK with local/cloud dual-mode support (#207) 2026-04-06 22:51:04 +08:00			`stream: bool = False) -> str \| QueryStream:`
			`"""Query documents in this collection.`

			`- stream=False: returns answer string (sync)`
			`- stream=True: returns async iterable of QueryEvent`

feat(collection): doc_ids accepts str\|list, design cleanups - Collection.query and Backend.query/query_stream accept doc_ids as str, list[str] or None. Single str is normalized to [str] inside each backend; bare [] is rejected with ValueError at both layers. - wrap_with_doc_context wraps the scoped doc list in <docs>...</docs> and SCOPED_SYSTEM_PROMPT instructs the agent to treat that block as data, not instructions (defense against prompt injection via auto-generated doc_description). - _require_cloud_api now distinguishes api_key="" from api_key=None; the former gives a targeted error pointing at the empty-string vs fall-back-to-local situation when legacy SDK methods are called. - Legacy PageIndexClient.list_documents docstring spells out the return-shape difference vs collection.list_documents() to flag a silent migration footgun (paginated dict with id/name keys vs plain list[dict] with doc_id/doc_name keys). - Remove dead CloudBackend.get_agent_tools stub (not on the Backend protocol; only ever returned an empty AgentTools()) and the SYSTEM_PROMPT alias (OPEN_/SCOPED_SYSTEM_PROMPT are the explicit names now). - README quick start and streaming example now pass doc_ids; new multi-document section shows both str and list forms. - examples/demo_query_modes.py exercises all five query-mode cases (single-doc, multi-doc with/without env var, scoped single, scoped multi) for manual verification. 2026-05-15 17:03:17 +08:00			``doc_ids`` can be a single doc id (``str``) or a list. ``None`` queries
			`the entire collection (experimental).`

feat: add PageIndex SDK with local/cloud dual-mode support (#207) 2026-04-06 22:51:04 +08:00			`Usage:`
feat(collection): doc_ids accepts str\|list, design cleanups - Collection.query and Backend.query/query_stream accept doc_ids as str, list[str] or None. Single str is normalized to [str] inside each backend; bare [] is rejected with ValueError at both layers. - wrap_with_doc_context wraps the scoped doc list in <docs>...</docs> and SCOPED_SYSTEM_PROMPT instructs the agent to treat that block as data, not instructions (defense against prompt injection via auto-generated doc_description). - _require_cloud_api now distinguishes api_key="" from api_key=None; the former gives a targeted error pointing at the empty-string vs fall-back-to-local situation when legacy SDK methods are called. - Legacy PageIndexClient.list_documents docstring spells out the return-shape difference vs collection.list_documents() to flag a silent migration footgun (paginated dict with id/name keys vs plain list[dict] with doc_id/doc_name keys). - Remove dead CloudBackend.get_agent_tools stub (not on the Backend protocol; only ever returned an empty AgentTools()) and the SYSTEM_PROMPT alias (OPEN_/SCOPED_SYSTEM_PROMPT are the explicit names now). - README quick start and streaming example now pass doc_ids; new multi-document section shows both str and list forms. - examples/demo_query_modes.py exercises all five query-mode cases (single-doc, multi-doc with/without env var, scoped single, scoped multi) for manual verification. 2026-05-15 17:03:17 +08:00			`answer = col.query("question", doc_ids=doc_id) # single`
			`answer = col.query("question", doc_ids=[d1, d2]) # multi`
			`async for event in col.query("question", doc_ids=doc_id, stream=True):`
feat: add PageIndex SDK with local/cloud dual-mode support (#207) 2026-04-06 22:51:04 +08:00			`...`
feat(collection): scoped query mode and experimental multi-doc warning - get_agent_tools branches on doc_ids: - scoped (doc_ids=[...]): drops list_documents and hard-enforces a whitelist on the remaining tools; system prompt switches to SCOPED_SYSTEM_PROMPT (no list_documents instruction); doc list + summaries are prepended to the user message via wrap_with_doc_context. - open (doc_ids=None): unchanged 4-tool agent loop. - list_documents now exposes doc_description (sqlite + cloud). - Collection.query emits UserWarning when doc_ids is None and the collection holds >1 documents; PAGEINDEX_EXPERIMENTAL_MULTIDOC=1 silences it. Single-doc collections skip the warning; empty collections raise ValueError. - Agents SDK tracing upload disabled by default (avoids SSL timeouts); PAGEINDEX_AGENTS_TRACING=1 re-enables it. - README: new SDK Usage section covering local/cloud quick start, streaming, multi-doc as experimental, and runnable examples. 2026-05-15 11:14:12 +08:00
			`Passing doc_ids=None queries the entire collection — this is`
			`experimental; emits a UserWarning unless PAGEINDEX_EXPERIMENTAL_MULTIDOC`
			`is set.`
feat: add PageIndex SDK with local/cloud dual-mode support (#207) 2026-04-06 22:51:04 +08:00			`"""`
feat(collection): doc_ids accepts str\|list, design cleanups - Collection.query and Backend.query/query_stream accept doc_ids as str, list[str] or None. Single str is normalized to [str] inside each backend; bare [] is rejected with ValueError at both layers. - wrap_with_doc_context wraps the scoped doc list in <docs>...</docs> and SCOPED_SYSTEM_PROMPT instructs the agent to treat that block as data, not instructions (defense against prompt injection via auto-generated doc_description). - _require_cloud_api now distinguishes api_key="" from api_key=None; the former gives a targeted error pointing at the empty-string vs fall-back-to-local situation when legacy SDK methods are called. - Legacy PageIndexClient.list_documents docstring spells out the return-shape difference vs collection.list_documents() to flag a silent migration footgun (paginated dict with id/name keys vs plain list[dict] with doc_id/doc_name keys). - Remove dead CloudBackend.get_agent_tools stub (not on the Backend protocol; only ever returned an empty AgentTools()) and the SYSTEM_PROMPT alias (OPEN_/SCOPED_SYSTEM_PROMPT are the explicit names now). - README quick start and streaming example now pass doc_ids; new multi-document section shows both str and list forms. - examples/demo_query_modes.py exercises all five query-mode cases (single-doc, multi-doc with/without env var, scoped single, scoped multi) for manual verification. 2026-05-15 17:03:17 +08:00			`if isinstance(doc_ids, str):`
			`doc_ids = [doc_ids]`
			`elif doc_ids == []:`
			`raise ValueError(`
			`"doc_ids cannot be empty; pass None to query the whole collection"`
			`)`
feat(collection): scoped query mode and experimental multi-doc warning - get_agent_tools branches on doc_ids: - scoped (doc_ids=[...]): drops list_documents and hard-enforces a whitelist on the remaining tools; system prompt switches to SCOPED_SYSTEM_PROMPT (no list_documents instruction); doc list + summaries are prepended to the user message via wrap_with_doc_context. - open (doc_ids=None): unchanged 4-tool agent loop. - list_documents now exposes doc_description (sqlite + cloud). - Collection.query emits UserWarning when doc_ids is None and the collection holds >1 documents; PAGEINDEX_EXPERIMENTAL_MULTIDOC=1 silences it. Single-doc collections skip the warning; empty collections raise ValueError. - Agents SDK tracing upload disabled by default (avoids SSL timeouts); PAGEINDEX_AGENTS_TRACING=1 re-enables it. - README: new SDK Usage section covering local/cloud quick start, streaming, multi-doc as experimental, and runnable examples. 2026-05-15 11:14:12 +08:00			`if doc_ids is None and not _multidoc_acked():`
			`docs = self._backend.list_documents(self._name)`
			`if not docs:`
			`raise ValueError(`
			`f"Cannot query collection '{self._name}': it is empty. "`
			`"Add documents with col.add(...) first."`
			`)`
			`if len(docs) > 1:`
			`warnings.warn(_MULTIDOC_WARNING, UserWarning, stacklevel=2)`
feat: add PageIndex SDK with local/cloud dual-mode support (#207) 2026-04-06 22:51:04 +08:00			`if stream:`
			`return QueryStream(self._backend, self._name, question, doc_ids)`
			`return self._backend.query(self._name, question, doc_ids)`