feat:compatible with Pageindex SDK (#238)

* feat:compatible with Pageindex SDK * corner cases fixed * fix: mock behavior of old SDK * fix: close streaming response and warn on empty api_key - LegacyCloudAPI: close response in `finally` for both _stream_chat_response variants so abandoned iterators no longer leak the TCP connection. - PageIndexClient: emit a warning instead of silently falling back to local when api_key is the empty string, surfacing typical env-var-unset misconfig. - FakeResponse: add close()/closed to match the real requests.Response API. - Add unit coverage for stream close (both paths) and the empty-api_key warning. - Add scripts/e2e_legacy_sdk.py to smoke-test the legacy SDK contract end-to-end against api.pageindex.ai. * chore: mark legacy SDK methods with @deprecated and docstring pointers - Decorate the 12 PageIndexClient cloud-SDK compat methods with @typing_extensions.deprecated(..., category=PendingDeprecationWarning): - IDE/type-checkers render them with a strikethrough hint - runtime warnings stay silent by default (no spam for existing callers), surfaceable via `python -W default::PendingDeprecationWarning` - Add a one-line docstring on each pointing to the Collection-based equivalent. - Promote typing-extensions to a direct dependency (was transitive via litellm). --------- Co-authored-by: XinyanZhou <xinyanzhou@XinyanZhoudeMacBook-Pro.local> Co-authored-by: saccharin98 <xinyanzhou938@gmail.com> Co-authored-by: mountain <kose2livs@gmail.com>
2026-05-13 08:42:37 +02:00 · 2026-05-11 21:06:23 +08:00 · 2026-05-11 21:06:23 +08:00 · 595895cf28
commit 595895cf28
parent 6d29886892
10 changed files with 1030 additions and 20 deletions
--- a/pageindex/init.py
+++ b/pageindex/init.py
@ -13,6 +13,7 @@ from .storage.protocol import StorageEngine
 from .events import QueryEvent
 from .errors import (
    PageIndexError,
+    PageIndexAPIError,
    CollectionNotFoundError,
    DocumentNotFoundError,
    IndexingError,
@ -32,6 +33,7 @@ __all__ = [
    "StorageEngine",
    "QueryEvent",
    "PageIndexError",
+    "PageIndexAPIError",
    "CollectionNotFoundError",
    "DocumentNotFoundError",
    "IndexingError",
--- a/pageindex/client.py
+++ b/pageindex/client.py
@ -1,10 +1,21 @@
 # pageindex/client.py
 from __future__ import annotations
 from pathlib import Path
+from typing import Any, Iterator
+
+from typing_extensions import deprecated
+
 from .collection import Collection
 from .config import IndexConfig
+from .errors import PageIndexAPIError
 from .parser.protocol import DocumentParser

+_LEGACY_SDK_MSG = (
+    "Legacy compatibility — new code should prefer the Collection-based API "
+    "(PageIndexClient.collection(...))."
+)
+_legacy_sdk = deprecated(_LEGACY_SDK_MSG, category=PendingDeprecationWarning)
+

 def _normalize_retrieve_model(model: str) -> str:
    """Preserve supported Agents SDK prefixes and route other provider paths via LiteLLM."""
@ -39,21 +50,34 @@ class PageIndexClient:
        # Or use LocalClient / CloudClient for explicit mode selection
    """

-    def __init__(self, api_key: str = None, model: str = None,
+    BASE_URL = "https://api.pageindex.ai"
+
+    def __init__(self, api_key: str | None = None, model: str = None,
                 retrieve_model: str = None, storage_path: str = None,
                 storage=None, index_config: IndexConfig | dict = None):
-        if api_key:
+        if api_key == "":
+            import logging
+            logging.getLogger(__name__).warning(
+                "PageIndexClient received an empty api_key; falling back to local mode. "
+                "Pass api_key=None to silence this warning, or provide a real key for cloud mode."
+            )
+            api_key = None
+        if api_key is not None:
            self._init_cloud(api_key)
        else:
            self._init_local(model, retrieve_model, storage_path, storage, index_config)

    def _init_cloud(self, api_key: str):
        from .backend.cloud import CloudBackend
+        from .cloud_api import LegacyCloudAPI
        self._backend = CloudBackend(api_key=api_key)
+        self._legacy_cloud_api = LegacyCloudAPI(api_key=api_key, base_url=self.BASE_URL)

    def _init_local(self, model: str = None, retrieve_model: str = None,
                    storage_path: str = None, storage=None,
                    index_config: IndexConfig | dict = None):
+        self._legacy_cloud_api = None
+
        # Build IndexConfig: merge model/retrieve_model with index_config
        overrides = {}
        if model:
@ -123,6 +147,124 @@ class PageIndexClient:
            raise PageIndexError("Custom parsers are not supported in cloud mode")
        self._backend.register_parser(parser)

+    def _require_cloud_api(self):
+        if self._legacy_cloud_api is None:
+            from .errors import PageIndexAPIError
+            raise PageIndexAPIError(
+                "This method is part of the pageindex 0.2.x cloud SDK API. "
+                "Initialize with api_key to use it."
+            )
+        return self._legacy_cloud_api
+
+    # ── pageindex 0.2.x cloud SDK compatibility (prefer Collection API for new code) ──
+    @_legacy_sdk
+    def submit_document(
+        self,
+        file_path: str,
+        mode: str | None = None,
+        beta_headers: list[str] | None = None,
+        folder_id: str | None = None,
+    ) -> dict[str, Any]:
+        """Legacy SDK compatibility — prefer ``client.collection(...).add(path)``."""
+        return self._require_cloud_api().submit_document(
+            file_path=file_path,
+            mode=mode,
+            beta_headers=beta_headers,
+            folder_id=folder_id,
+        )
+
+    @_legacy_sdk
+    def get_ocr(self, doc_id: str, format: str = "page") -> dict[str, Any]:
+        """Legacy SDK compatibility — prefer ``collection.get_page_content(doc_id, pages)``."""
+        return self._require_cloud_api().get_ocr(doc_id=doc_id, format=format)
+
+    @_legacy_sdk
+    def get_tree(self, doc_id: str, node_summary: bool = False) -> dict[str, Any]:
+        """Legacy SDK compatibility — prefer ``collection.get_document_structure(doc_id)``."""
+        return self._require_cloud_api().get_tree(doc_id=doc_id, node_summary=node_summary)
+
+    @_legacy_sdk
+    def is_retrieval_ready(self, doc_id: str) -> bool:
+        """Legacy SDK compatibility — Collection API handles readiness internally."""
+        return self._require_cloud_api().is_retrieval_ready(doc_id=doc_id)
+
+    @_legacy_sdk
+    def submit_query(self, doc_id: str, query: str, thinking: bool = False) -> dict[str, Any]:
+        """Legacy SDK compatibility — prefer ``collection.query(question, doc_ids=[doc_id])``."""
+        return self._require_cloud_api().submit_query(
+            doc_id=doc_id,
+            query=query,
+            thinking=thinking,
+        )
+
+    @_legacy_sdk
+    def get_retrieval(self, retrieval_id: str) -> dict[str, Any]:
+        """Legacy SDK compatibility — Collection API returns answers synchronously."""
+        return self._require_cloud_api().get_retrieval(retrieval_id=retrieval_id)
+
+    @_legacy_sdk
+    def chat_completions(
+        self,
+        messages: list[dict[str, str]],
+        stream: bool = False,
+        doc_id: str | list[str] | None = None,
+        temperature: float | None = None,
+        stream_metadata: bool = False,
+        enable_citations: bool = False,
+    ) -> dict[str, Any] | Iterator[str] | Iterator[dict[str, Any]]:
+        """Legacy SDK compatibility — prefer ``collection.query(...)``."""
+        return self._require_cloud_api().chat_completions(
+            messages=messages,
+            stream=stream,
+            doc_id=doc_id,
+            temperature=temperature,
+            stream_metadata=stream_metadata,
+            enable_citations=enable_citations,
+        )
+
+    @_legacy_sdk
+    def get_document(self, doc_id: str) -> dict[str, Any]:
+        """Legacy SDK compatibility — prefer ``collection.get_document(doc_id)``."""
+        return self._require_cloud_api().get_document(doc_id=doc_id)
+
+    @_legacy_sdk
+    def delete_document(self, doc_id: str) -> dict[str, Any]:
+        """Legacy SDK compatibility — prefer ``collection.delete_document(doc_id)``."""
+        return self._require_cloud_api().delete_document(doc_id=doc_id)
+
+    @_legacy_sdk
+    def list_documents(
+        self,
+        limit: int = 50,
+        offset: int = 0,
+        folder_id: str | None = None,
+    ) -> dict[str, Any]:
+        """Legacy SDK compatibility — prefer ``collection.list_documents()``."""
+        return self._require_cloud_api().list_documents(
+            limit=limit,
+            offset=offset,
+            folder_id=folder_id,
+        )
+
+    @_legacy_sdk
+    def create_folder(
+        self,
+        name: str,
+        description: str | None = None,
+        parent_folder_id: str | None = None,
+    ) -> dict[str, Any]:
+        """Legacy SDK compatibility — prefer ``client.collection(name)`` (auto-creates)."""
+        return self._require_cloud_api().create_folder(
+            name=name,
+            description=description,
+            parent_folder_id=parent_folder_id,
+        )
+
+    @_legacy_sdk
+    def list_folders(self, parent_folder_id: str | None = None) -> dict[str, Any]:
+        """Legacy SDK compatibility — prefer ``client.list_collections()``."""
+        return self._require_cloud_api().list_folders(parent_folder_id=parent_folder_id)
+

 class LocalClient(PageIndexClient):
    """Local mode — indexes and queries documents on your machine.
--- a/pageindex/cloud_api.py
+++ b/pageindex/cloud_api.py
@ -0,0 +1,265 @@
+from __future__ import annotations
+
+import json
+from typing import Any, Iterator
+
+import requests
+
+from .errors import PageIndexAPIError
+
+
+class LegacyCloudAPI:
+    """Compatibility layer for the pageindex 0.2.x cloud SDK API."""
+
+    BASE_URL = "https://api.pageindex.ai"
+
+    def __init__(self, api_key: str, base_url: str | None = None):
+        self.api_key = api_key
+        self.base_url = base_url or self.BASE_URL
+
+    def _headers(self) -> dict[str, str]:
+        return {"api_key": self.api_key}
+
+    def _request(self, method: str, path: str, error_prefix: str, **kwargs) -> requests.Response:
+        try:
+            response = requests.request(
+                method,
+                f"{self.base_url}{path}",
+                headers=self._headers(),
+                **kwargs,
+            )
+        except requests.RequestException as e:
+            raise PageIndexAPIError(f"{error_prefix}: {e}") from e
+
+        if response.status_code != 200:
+            raise PageIndexAPIError(f"{error_prefix}: {response.text}")
+        return response
+
+    def submit_document(
+        self,
+        file_path: str,
+        mode: str | None = None,
+        beta_headers: list[str] | None = None,
+        folder_id: str | None = None,
+    ) -> dict[str, Any]:
+        data: dict[str, Any] = {"if_retrieval": True}
+        if mode is not None:
+            data["mode"] = mode
+        if beta_headers is not None:
+            data["beta_headers"] = json.dumps(beta_headers)
+        if folder_id is not None:
+            data["folder_id"] = folder_id
+
+        with open(file_path, "rb") as f:
+            response = self._request(
+                "POST",
+                "/doc/",
+                "Failed to submit document",
+                files={"file": f},
+                data=data,
+            )
+
+        return response.json()
+
+    def get_ocr(self, doc_id: str, format: str = "page") -> dict[str, Any]:
+        if format not in ["page", "node", "raw"]:
+            raise ValueError("Format parameter must be 'page', 'node', or 'raw'")
+
+        response = self._request(
+            "GET",
+            f"/doc/{doc_id}/?type=ocr&format={format}",
+            "Failed to get OCR result",
+        )
+        return response.json()
+
+    def get_tree(self, doc_id: str, node_summary: bool = False) -> dict[str, Any]:
+        response = self._request(
+            "GET",
+            f"/doc/{doc_id}/?type=tree&summary={node_summary}",
+            "Failed to get tree result",
+        )
+        return response.json()
+
+    def is_retrieval_ready(self, doc_id: str) -> bool:
+        try:
+            result = self.get_tree(doc_id)
+            return result.get("retrieval_ready", False)
+        except PageIndexAPIError:
+            return False
+
+    def submit_query(self, doc_id: str, query: str, thinking: bool = False) -> dict[str, Any]:
+        payload = {
+            "doc_id": doc_id,
+            "query": query,
+            "thinking": thinking,
+        }
+        response = self._request(
+            "POST",
+            "/retrieval/",
+            "Failed to submit retrieval",
+            json=payload,
+        )
+        return response.json()
+
+    def get_retrieval(self, retrieval_id: str) -> dict[str, Any]:
+        response = self._request(
+            "GET",
+            f"/retrieval/{retrieval_id}/",
+            "Failed to get retrieval result",
+        )
+        return response.json()
+
+    def chat_completions(
+        self,
+        messages: list[dict[str, str]],
+        stream: bool = False,
+        doc_id: str | list[str] | None = None,
+        temperature: float | None = None,
+        stream_metadata: bool = False,
+        enable_citations: bool = False,
+    ) -> dict[str, Any] | Iterator[str] | Iterator[dict[str, Any]]:
+        payload: dict[str, Any] = {
+            "messages": messages,
+            "stream": stream,
+        }
+
+        if doc_id is not None:
+            payload["doc_id"] = doc_id
+        if temperature is not None:
+            payload["temperature"] = temperature
+        if enable_citations:
+            payload["enable_citations"] = enable_citations
+
+        response = self._request(
+            "POST",
+            "/chat/completions/",
+            "Failed to get chat completion",
+            json=payload,
+            stream=stream,
+        )
+
+        if stream:
+            if stream_metadata:
+                return self._stream_chat_response_raw(response)
+            return self._stream_chat_response(response)
+        return response.json()
+
+    def _stream_chat_response(self, response: requests.Response) -> Iterator[str]:
+        try:
+            for line in response.iter_lines():
+                if not line:
+                    continue
+                line = line.decode("utf-8")
+                if not line.startswith("data: "):
+                    continue
+                data = line[6:]
+                if data == "[DONE]":
+                    break
+
+                try:
+                    chunk = json.loads(data)
+                except json.JSONDecodeError:
+                    continue
+                choices = chunk.get("choices") or []
+                if not choices:
+                    continue
+                content = choices[0].get("delta", {}).get("content", "")
+                if content:
+                    yield content
+        except requests.RequestException as e:
+            raise PageIndexAPIError(f"Failed to stream chat completion: {e}") from e
+        finally:
+            response.close()
+
+    def _stream_chat_response_raw(self, response: requests.Response) -> Iterator[dict[str, Any]]:
+        try:
+            for line in response.iter_lines():
+                if not line:
+                    continue
+                line = line.decode("utf-8")
+                if not line.startswith("data: "):
+                    continue
+                data = line[6:]
+                if data == "[DONE]":
+                    break
+
+                try:
+                    yield json.loads(data)
+                except json.JSONDecodeError:
+                    continue
+        except requests.RequestException as e:
+            raise PageIndexAPIError(f"Failed to stream chat completion: {e}") from e
+        finally:
+            response.close()
+
+    def get_document(self, doc_id: str) -> dict[str, Any]:
+        response = self._request(
+            "GET",
+            f"/doc/{doc_id}/metadata/",
+            "Failed to get document metadata",
+        )
+        return response.json()
+
+    def delete_document(self, doc_id: str) -> dict[str, Any]:
+        response = self._request(
+            "DELETE",
+            f"/doc/{doc_id}/",
+            "Failed to delete document",
+        )
+        return response.json()
+
+    def list_documents(
+        self,
+        limit: int = 50,
+        offset: int = 0,
+        folder_id: str | None = None,
+    ) -> dict[str, Any]:
+        if limit < 1 or limit > 100:
+            raise ValueError("limit must be between 1 and 100")
+        if offset < 0:
+            raise ValueError("offset must be non-negative")
+
+        params: dict[str, Any] = {"limit": limit, "offset": offset}
+        if folder_id is not None:
+            params["folder_id"] = folder_id
+
+        response = self._request(
+            "GET",
+            "/docs/",
+            "Failed to list documents",
+            params=params,
+        )
+        return response.json()
+
+    def create_folder(
+        self,
+        name: str,
+        description: str | None = None,
+        parent_folder_id: str | None = None,
+    ) -> dict[str, Any]:
+        payload: dict[str, Any] = {"name": name}
+        if description is not None:
+            payload["description"] = description
+        if parent_folder_id is not None:
+            payload["parent_folder_id"] = parent_folder_id
+
+        response = self._request(
+            "POST",
+            "/folder/",
+            "Failed to create folder",
+            json=payload,
+        )
+        return response.json()
+
+    def list_folders(self, parent_folder_id: str | None = None) -> dict[str, Any]:
+        params = {}
+        if parent_folder_id is not None:
+            params["parent_folder_id"] = parent_folder_id
+
+        response = self._request(
+            "GET",
+            "/folders/",
+            "Failed to list folders",
+            params=params,
+        )
+        return response.json()
--- a/pageindex/errors.py
+++ b/pageindex/errors.py
@ -18,7 +18,15 @@ class IndexingError(PageIndexError):
    pass


-class CloudAPIError(PageIndexError):
+class PageIndexAPIError(PageIndexError):
+    """PageIndex cloud API returned an error.
+
+    Kept for compatibility with the pageindex 0.2.x cloud SDK.
+    """
+    pass
+
+
+class CloudAPIError(PageIndexAPIError):
    """Cloud API returned error."""
    pass

--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@ -15,6 +15,7 @@ load_dotenv()
 import logging
 import yaml
 from pathlib import Path
+from pprint import pprint
 from types import SimpleNamespace as config

 # Backward compatibility: support CHATGPT_API_KEY as alias for OPENAI_API_KEY
@ -23,6 +24,22 @@ if not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):

 litellm.drop_params = True

+async def call_llm(prompt, api_key, model="gpt-4.1", temperature=0):
+    """Call an LLM to generate a response to a prompt.
+
+    Kept for compatibility with the pageindex 0.2.x SDK utility API.
+    """
+    import openai
+
+    client = openai.AsyncOpenAI(api_key=api_key)
+    response = await client.chat.completions.create(
+        model=model,
+        messages=[{"role": "user", "content": prompt}],
+        temperature=temperature,
+    )
+    return response.choices[0].message.content.strip()
+
+
 def count_tokens(text, model=None):
    if not text:
        return 0
@ -463,12 +480,14 @@ def clean_structure_post(data):
            clean_structure_post(section)
    return data

-def remove_fields(data, fields=['text']):
+def remove_fields(data, fields=['text'], max_len=None):
    if isinstance(data, dict):
-        return {k: remove_fields(v, fields)
+        return {k: remove_fields(v, fields, max_len)
            for k, v in data.items() if k not in fields}
    elif isinstance(data, list):
-        return [remove_fields(item, fields) for item in data]
+        return [remove_fields(item, fields, max_len) for item in data]
+    elif isinstance(data, str):
+        return data[:max_len] + '...' if max_len is not None and len(data) > max_len else data
    return data

 def print_toc(tree, indent=0):
@ -684,27 +703,72 @@ class ConfigLoader:
        merged = {**self._default_dict, **user_dict}
        return config(**merged)

-def create_node_mapping(tree):
-    """Create a flat dict mapping node_id to node for quick lookup."""
+def create_node_mapping(tree, include_page_ranges=False, max_page=None):
+    """Create a mapping of node_id to node for quick lookup.
+
+    The optional page-range arguments are kept for compatibility with the
+    pageindex 0.2.x SDK utility API.
+    """
+    def get_all_nodes(nodes):
+        if isinstance(nodes, dict):
+            return [nodes] + [
+                child_node
+                for child in nodes.get('nodes', [])
+                for child_node in get_all_nodes(child)
+            ]
+        elif isinstance(nodes, list):
+            return [
+                child_node
+                for item in nodes
+                for child_node in get_all_nodes(item)
+            ]
+        return []
+
+    all_nodes = get_all_nodes(tree)
+
+    if not include_page_ranges:
+        return {node["node_id"]: node for node in all_nodes if node.get("node_id")}
+
    mapping = {}
-    def _traverse(nodes):
-        for node in nodes:
-            if node.get('node_id'):
-                mapping[node['node_id']] = node
-            if node.get('nodes'):
-                _traverse(node['nodes'])
-    _traverse(tree)
+    for i, node in enumerate(all_nodes):
+        if not node.get("node_id"):
+            continue
+        start_page = node.get("page_index", node.get("start_index"))
+        if node.get("end_index") is not None:
+            end_page = node.get("end_index")
+        elif i + 1 < len(all_nodes):
+            next_node = all_nodes[i + 1]
+            end_page = next_node.get("page_index", next_node.get("start_index"))
+        else:
+            end_page = max_page
+
+        mapping[node["node_id"]] = {
+            "node": node,
+            "start_index": start_page,
+            "end_index": end_page,
+        }
+
    return mapping

-def print_tree(tree, indent=0):
+def print_tree(tree, exclude_fields=None, indent=None):
+    if exclude_fields is None:
+        exclude_fields = ['text', 'page_index']
+    if isinstance(exclude_fields, int):
+        indent = exclude_fields
+        exclude_fields = None
+    if indent is None and exclude_fields is not None:
+        cleaned_tree = remove_fields(copy.deepcopy(tree), exclude_fields, max_len=40)
+        pprint(cleaned_tree, sort_dicts=False, width=100)
+        return
+
+    indent = indent or 0
    for node in tree:
        summary = node.get('summary') or node.get('prefix_summary', '')
        summary_str = f"  —  {summary[:60]}..." if summary else ""
        print('  ' * indent + f"[{node.get('node_id', '?')}] {node.get('title', '')}{summary_str}")
        if node.get('nodes'):
-            print_tree(node['nodes'], indent + 1)
+            print_tree(node['nodes'], exclude_fields=exclude_fields, indent=indent + 1)

 def print_wrapped(text, width=100):
    for line in text.splitlines():
        print(textwrap.fill(line, width=width))
-