mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-13 00:32:36 +02:00
* feat: compatible with PageIndex SDK
* corner cases fixed
* fix: mock behavior of old SDK
* fix: close streaming response and warn on empty api_key
- LegacyCloudAPI: close response in `finally` for both _stream_chat_response
variants so abandoned iterators no longer leak the TCP connection.
- PageIndexClient: emit a warning instead of silently falling back to local
when api_key is the empty string, surfacing typical env-var-unset misconfig.
- FakeResponse: add close()/closed to match the real requests.Response API.
- Add unit coverage for stream close (both paths) and the empty-api_key warning.
- Add scripts/e2e_legacy_sdk.py to smoke-test the legacy SDK contract end-to-end
against api.pageindex.ai.
* chore: mark legacy SDK methods with @deprecated and docstring pointers
- Decorate the 12 PageIndexClient cloud-SDK compat methods with
@typing_extensions.deprecated(..., category=PendingDeprecationWarning):
- IDE/type-checkers render them with a strikethrough hint
- runtime warnings stay silent by default (no spam for existing callers),
surfaceable via `python -W default::PendingDeprecationWarning`
- Add a one-line docstring on each pointing to the Collection-based equivalent.
- Promote typing-extensions to a direct dependency (was transitive via litellm).
---------
Co-authored-by: XinyanZhou <xinyanzhou@XinyanZhoudeMacBook-Pro.local>
Co-authored-by: saccharin98 <xinyanzhou938@gmail.com>
Co-authored-by: mountain <kose2livs@gmail.com>
303 lines
12 KiB
Python
303 lines
12 KiB
Python
# pageindex/client.py
from __future__ import annotations

import logging
import os
from pathlib import Path
from typing import Any, Iterator

from typing_extensions import deprecated

from .collection import Collection
from .config import IndexConfig
from .errors import PageIndexAPIError
from .parser.protocol import DocumentParser

# Message attached to every legacy (pageindex 0.2.x cloud SDK) compatibility method.
_LEGACY_SDK_MSG = (
    "Legacy compatibility — new code should prefer the Collection-based API "
    "(PageIndexClient.collection(...))."
)

# Reusable decorator for the legacy methods. PendingDeprecationWarning is used
# as the category so the hint is visible to tooling while staying quiet under
# Python's default warning filters.
_legacy_sdk = deprecated(_LEGACY_SDK_MSG, category=PendingDeprecationWarning)


def _normalize_retrieve_model(model: str) -> str:
|
|
"""Preserve supported Agents SDK prefixes and route other provider paths via LiteLLM."""
|
|
passthrough_prefixes = ("litellm/", "openai/")
|
|
if not model or "/" not in model:
|
|
return model
|
|
if model.startswith(passthrough_prefixes):
|
|
return model
|
|
return f"litellm/{model}"
|
|
|
|
|
|
class PageIndexClient:
    """PageIndex client — supports both local and cloud modes.

    Args:
        api_key: PageIndex cloud API key. When provided, cloud mode is used
            and local-only params (model, storage_path, index_config, …) are ignored.
            An empty string is treated as "not provided": a warning is logged
            and local mode is used.
        model: LLM model for indexing (local mode only, default: gpt-4o-2024-11-20).
        retrieve_model: LLM model for agent QA (local mode only, default: same as model).
        storage_path: Directory for SQLite DB and files (local mode only, default: ./.pageindex).
        storage: Custom StorageEngine instance (local mode only).
        index_config: Advanced indexing parameters (local mode only, optional).
            Pass an IndexConfig instance or a dict. Defaults are sensible for most use cases.

    Usage:
        # Local mode (auto-detected when no api_key)
        client = PageIndexClient(model="gpt-5.4")

        # Cloud mode (auto-detected when api_key provided)
        client = PageIndexClient(api_key="your-api-key")

        # Or use LocalClient / CloudClient for explicit mode selection
    """

    BASE_URL = "https://api.pageindex.ai"

    def __init__(self, api_key: str | None = None, model: str | None = None,
                 retrieve_model: str | None = None, storage_path: str | None = None,
                 storage=None, index_config: IndexConfig | dict | None = None):
        if api_key == "":
            # An empty key is almost always an unset environment variable;
            # surface it instead of silently switching modes.
            logging.getLogger(__name__).warning(
                "PageIndexClient received an empty api_key; falling back to local mode. "
                "Pass api_key=None to silence this warning, or provide a real key for cloud mode."
            )
            api_key = None
        if api_key is not None:
            self._init_cloud(api_key)
        else:
            self._init_local(model, retrieve_model, storage_path, storage, index_config)

    def _init_cloud(self, api_key: str):
        """Set up the cloud backend and the legacy cloud-SDK adapter."""
        # Imported lazily so local-mode users do not pay for the cloud modules.
        from .backend.cloud import CloudBackend
        from .cloud_api import LegacyCloudAPI
        self._backend = CloudBackend(api_key=api_key)
        self._legacy_cloud_api = LegacyCloudAPI(api_key=api_key, base_url=self.BASE_URL)

    def _init_local(self, model: str | None = None, retrieve_model: str | None = None,
                    storage_path: str | None = None, storage=None,
                    index_config: IndexConfig | dict | None = None):
        """Set up the local backend.

        Explicit ``model`` / ``retrieve_model`` arguments take precedence over
        the same fields in ``index_config``.

        Raises:
            PageIndexError: If no API key is configured for the indexing
                model's provider (see ``_validate_llm_provider``).
        """
        # Legacy cloud-SDK methods are unavailable in local mode.
        self._legacy_cloud_api = None

        # Build IndexConfig: merge model/retrieve_model with index_config
        overrides = {}
        if model:
            overrides["model"] = model
        if retrieve_model:
            overrides["retrieve_model"] = retrieve_model
        if isinstance(index_config, IndexConfig):
            opt = index_config.model_copy(update=overrides)
        elif isinstance(index_config, dict):
            merged = {**index_config, **overrides}  # explicit model/retrieve_model win
            opt = IndexConfig(**merged)
        else:
            opt = IndexConfig(**overrides) if overrides else IndexConfig()

        self._validate_llm_provider(opt.model)

        storage_path = Path(storage_path or ".pageindex").resolve()
        storage_path.mkdir(parents=True, exist_ok=True)

        # Imported lazily so cloud-mode users do not pay for the local modules.
        from .storage.sqlite import SQLiteStorage
        from .backend.local import LocalBackend
        storage_engine = storage or SQLiteStorage(str(storage_path / "pageindex.db"))
        self._backend = LocalBackend(
            storage=storage_engine,
            files_dir=str(storage_path / "files"),
            model=opt.model,
            retrieve_model=_normalize_retrieve_model(opt.retrieve_model or opt.model),
            index_config=opt,
        )

    @staticmethod
    def _validate_llm_provider(model: str) -> None:
        """Check via litellm that an API key is configured for ``model``'s provider.

        Validation is best-effort: if litellm cannot be imported or cannot
        resolve a provider for ``model``, the check is skipped silently.

        Raises:
            PageIndexError: If litellm reports no key for the provider and the
                conventional ``<PROVIDER>_API_KEY`` environment variable is unset.
        """
        try:
            import litellm
            # NOTE(review): presumably disables litellm's remote cost-map lookup — confirm.
            litellm.model_cost_map_url = ""
            _, provider, _, _ = litellm.get_llm_provider(model=model)
        except Exception:
            # Deliberate best-effort: an unresolvable model must not block startup.
            return

        key = litellm.get_api_key(llm_provider=provider, dynamic_api_key=None)
        if key:
            return
        # Fall back to the conventional env var before declaring the key missing.
        common_var = f"{provider.upper()}_API_KEY"
        if os.getenv(common_var):
            return
        from .errors import PageIndexError
        raise PageIndexError(
            f"API key not configured for provider '{provider}' (model: {model}). "
            f"Set the {common_var} environment variable."
        )

    def collection(self, name: str = "default") -> Collection:
        """Get or create a collection. Defaults to 'default'."""
        self._backend.get_or_create_collection(name)
        return Collection(name=name, backend=self._backend)

    def list_collections(self) -> list[str]:
        """Return the backend's collection listing."""
        return self._backend.list_collections()

    def delete_collection(self, name: str) -> None:
        """Delete the collection ``name`` from the backend."""
        self._backend.delete_collection(name)

    def register_parser(self, parser: DocumentParser) -> None:
        """Register a custom document parser. Only available in local mode.

        Raises:
            PageIndexError: If the active backend has no ``register_parser``.
        """
        if not hasattr(self._backend, 'register_parser'):
            from .errors import PageIndexError
            raise PageIndexError("Custom parsers are not supported in cloud mode")
        self._backend.register_parser(parser)

    def _require_cloud_api(self):
        """Return the legacy cloud API adapter.

        Raises:
            PageIndexAPIError: If the client was not initialized in cloud mode.
        """
        if self._legacy_cloud_api is None:
            raise PageIndexAPIError(
                "This method is part of the pageindex 0.2.x cloud SDK API. "
                "Initialize with api_key to use it."
            )
        return self._legacy_cloud_api

    # ── pageindex 0.2.x cloud SDK compatibility (prefer Collection API for new code) ──
    @_legacy_sdk
    def submit_document(
        self,
        file_path: str,
        mode: str | None = None,
        beta_headers: list[str] | None = None,
        folder_id: str | None = None,
    ) -> dict[str, Any]:
        """Legacy SDK compatibility — prefer ``client.collection(...).add(path)``."""
        return self._require_cloud_api().submit_document(
            file_path=file_path,
            mode=mode,
            beta_headers=beta_headers,
            folder_id=folder_id,
        )

    @_legacy_sdk
    def get_ocr(self, doc_id: str, format: str = "page") -> dict[str, Any]:
        """Legacy SDK compatibility — prefer ``collection.get_page_content(doc_id, pages)``."""
        return self._require_cloud_api().get_ocr(doc_id=doc_id, format=format)

    @_legacy_sdk
    def get_tree(self, doc_id: str, node_summary: bool = False) -> dict[str, Any]:
        """Legacy SDK compatibility — prefer ``collection.get_document_structure(doc_id)``."""
        return self._require_cloud_api().get_tree(doc_id=doc_id, node_summary=node_summary)

    @_legacy_sdk
    def is_retrieval_ready(self, doc_id: str) -> bool:
        """Legacy SDK compatibility — Collection API handles readiness internally."""
        return self._require_cloud_api().is_retrieval_ready(doc_id=doc_id)

    @_legacy_sdk
    def submit_query(self, doc_id: str, query: str, thinking: bool = False) -> dict[str, Any]:
        """Legacy SDK compatibility — prefer ``collection.query(question, doc_ids=[doc_id])``."""
        return self._require_cloud_api().submit_query(
            doc_id=doc_id,
            query=query,
            thinking=thinking,
        )

    @_legacy_sdk
    def get_retrieval(self, retrieval_id: str) -> dict[str, Any]:
        """Legacy SDK compatibility — Collection API returns answers synchronously."""
        return self._require_cloud_api().get_retrieval(retrieval_id=retrieval_id)

    @_legacy_sdk
    def chat_completions(
        self,
        messages: list[dict[str, str]],
        stream: bool = False,
        doc_id: str | list[str] | None = None,
        temperature: float | None = None,
        stream_metadata: bool = False,
        enable_citations: bool = False,
    ) -> dict[str, Any] | Iterator[str] | Iterator[dict[str, Any]]:
        """Legacy SDK compatibility — prefer ``collection.query(...)``."""
        return self._require_cloud_api().chat_completions(
            messages=messages,
            stream=stream,
            doc_id=doc_id,
            temperature=temperature,
            stream_metadata=stream_metadata,
            enable_citations=enable_citations,
        )

    @_legacy_sdk
    def get_document(self, doc_id: str) -> dict[str, Any]:
        """Legacy SDK compatibility — prefer ``collection.get_document(doc_id)``."""
        return self._require_cloud_api().get_document(doc_id=doc_id)

    @_legacy_sdk
    def delete_document(self, doc_id: str) -> dict[str, Any]:
        """Legacy SDK compatibility — prefer ``collection.delete_document(doc_id)``."""
        return self._require_cloud_api().delete_document(doc_id=doc_id)

    @_legacy_sdk
    def list_documents(
        self,
        limit: int = 50,
        offset: int = 0,
        folder_id: str | None = None,
    ) -> dict[str, Any]:
        """Legacy SDK compatibility — prefer ``collection.list_documents()``."""
        return self._require_cloud_api().list_documents(
            limit=limit,
            offset=offset,
            folder_id=folder_id,
        )

    @_legacy_sdk
    def create_folder(
        self,
        name: str,
        description: str | None = None,
        parent_folder_id: str | None = None,
    ) -> dict[str, Any]:
        """Legacy SDK compatibility — prefer ``client.collection(name)`` (auto-creates)."""
        return self._require_cloud_api().create_folder(
            name=name,
            description=description,
            parent_folder_id=parent_folder_id,
        )

    @_legacy_sdk
    def list_folders(self, parent_folder_id: str | None = None) -> dict[str, Any]:
        """Legacy SDK compatibility — prefer ``client.list_collections()``."""
        return self._require_cloud_api().list_folders(parent_folder_id=parent_folder_id)


class LocalClient(PageIndexClient):
    """Local mode — indexes and queries documents on your machine.

    Args:
        model: LLM model for indexing (default: gpt-4o-2024-11-20)
        retrieve_model: LLM model for agent QA (default: same as model)
        storage_path: Directory for SQLite DB and files (default: ./.pageindex)
        storage: Custom StorageEngine instance (default: SQLiteStorage)
        index_config: Advanced indexing parameters. Pass an IndexConfig instance
            or a dict. All fields have sensible defaults — most users don't need this.

    Example::

        # Simple — defaults are fine
        client = LocalClient(model="gpt-5.4")

        # Advanced — tune indexing parameters
        from pageindex.config import IndexConfig
        client = LocalClient(
            model="gpt-5.4",
            index_config=IndexConfig(toc_check_page_num=30),
        )
    """

    def __init__(self, model: str | None = None, retrieve_model: str | None = None,
                 storage_path: str | None = None, storage=None,
                 index_config: IndexConfig | dict | None = None):
        # Skips PageIndexClient.__init__ on purpose: the mode is fixed here, so
        # the api_key-based auto-detection must not run.
        self._init_local(model, retrieve_model, storage_path, storage, index_config)


class CloudClient(PageIndexClient):
    """Cloud mode — fully managed by PageIndex cloud service. No LLM key needed.

    Args:
        api_key: PageIndex cloud API key, passed through unchanged to the
            cloud backend (no empty-string fallback to local mode here).
    """

    def __init__(self, api_key: str):
        # Skips PageIndexClient.__init__ on purpose: the mode is fixed here, so
        # the api_key-based auto-detection must not run.
        self._init_cloud(api_key)