feat: add PageIndex SDK with local/cloud dual-mode support (#207)

2026-07-18 21:21:05 +02:00 · 2026-04-06 22:51:04 +08:00 · 2026-04-06 22:51:04 +08:00 · c7fe93bb56
commit c7fe93bb56
parent f2dcffc0b7
45 changed files with 4225 additions and 274 deletions
--- a/pageindex/client.py
+++ b/pageindex/client.py
@ -1,18 +1,9 @@
-import os
-import uuid
-import json
-import asyncio
-import concurrent.futures
+# pageindex/client.py
+from __future__ import annotations
 from pathlib import Path
-
-import PyPDF2
-
-from .page_index import page_index
-from .page_index_md import md_to_tree
-from .retrieve import get_document, get_document_structure, get_page_content
-from .utils import ConfigLoader, remove_fields
-
-META_INDEX = "_meta.json"
+from .collection import Collection
+from .config import IndexConfig
+from .parser.protocol import DocumentParser


 def _normalize_retrieve_model(model: str) -> str:
@ -26,209 +17,145 @@ def _normalize_retrieve_model(model: str) -> str:


 class PageIndexClient:
-    """
-    A client for indexing and retrieving document content.
-    Flow: index() -> get_document() / get_document_structure() / get_page_content()
+    """PageIndex client — supports both local and cloud modes.

-    For agent-based QA, see examples/agentic_vectorless_rag_demo.py.
+    Args:
+        api_key: PageIndex cloud API key. When provided, cloud mode is used
+            and local-only params (model, storage_path, index_config, …) are ignored.
+        model: LLM model for indexing (local mode only, default: gpt-4o-2024-11-20).
+        retrieve_model: LLM model for agent QA (local mode only, default: same as model).
+        storage_path: Directory for SQLite DB and files (local mode only, default: ./.pageindex).
+        storage: Custom StorageEngine instance (local mode only).
+        index_config: Advanced indexing parameters (local mode only, optional).
+            Pass an IndexConfig instance or a dict. Defaults are sensible for most use cases.
+
+    Usage:
+        # Local mode (auto-detected when no api_key)
+        client = PageIndexClient(model="gpt-5.4")
+
+        # Cloud mode (auto-detected when api_key provided)
+        client = PageIndexClient(api_key="your-api-key")
+
+        # Or use LocalClient / CloudClient for explicit mode selection
    """
-    def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, workspace: str = None):
+
+    def __init__(self, api_key: str = None, model: str = None,
+                 retrieve_model: str = None, storage_path: str = None,
+                 storage=None, index_config: IndexConfig | dict = None):
        if api_key:
-            os.environ["OPENAI_API_KEY"] = api_key
-        elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
-            os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY")
-        self.workspace = Path(workspace).expanduser() if workspace else None
+            self._init_cloud(api_key)
+        else:
+            self._init_local(model, retrieve_model, storage_path, storage, index_config)
+
+    def _init_cloud(self, api_key: str):
+        """Set up cloud mode: build a CloudBackend from ``api_key`` and store it as the backend."""
+        # Imported inside the method, so the cloud backend module is only loaded when this runs.
+        from .backend.cloud import CloudBackend
+        self._backend = CloudBackend(api_key=api_key)
+
+    def _init_local(self, model: str = None, retrieve_model: str = None,
+                    storage_path: str = None, storage=None,
+                    index_config: IndexConfig | dict = None):
+        # Build IndexConfig: merge model/retrieve_model with index_config
        overrides = {}
        if model:
            overrides["model"] = model
        if retrieve_model:
            overrides["retrieve_model"] = retrieve_model
-        opt = ConfigLoader().load(overrides or None)
-        self.model = opt.model
-        self.retrieve_model = _normalize_retrieve_model(opt.retrieve_model or self.model)
-        if self.workspace:
-            self.workspace.mkdir(parents=True, exist_ok=True)
-        self.documents = {}
-        if self.workspace:
-            self._load_workspace()
-
-    def index(self, file_path: str, mode: str = "auto") -> str:
-        """Index a document. Returns a document_id."""
-        # Persist a canonical absolute path so workspace reloads do not
-        # reinterpret caller-relative paths against the workspace directory.
-        file_path = os.path.abspath(os.path.expanduser(file_path))
-        if not os.path.exists(file_path):
-            raise FileNotFoundError(f"File not found: {file_path}")
-
-        doc_id = str(uuid.uuid4())
-        ext = os.path.splitext(file_path)[1].lower()
-
-        is_pdf = ext == '.pdf'
-        is_md = ext in ['.md', '.markdown']
-
-        if mode == "pdf" or (mode == "auto" and is_pdf):
-            print(f"Indexing PDF: {file_path}")
-            result = page_index(
-                doc=file_path,
-                model=self.model,
-                if_add_node_summary='yes',
-                if_add_node_text='yes',
-                if_add_node_id='yes',
-                if_add_doc_description='yes'
-            )
-            # Extract per-page text so queries don't need the original PDF
-            pages = []
-            with open(file_path, 'rb') as f:
-                pdf_reader = PyPDF2.PdfReader(f)
-                for i, page in enumerate(pdf_reader.pages, 1):
-                    pages.append({'page': i, 'content': page.extract_text() or ''})
-
-            self.documents[doc_id] = {
-                'id': doc_id,
-                'type': 'pdf',
-                'path': file_path,
-                'doc_name': result.get('doc_name', ''),
-                'doc_description': result.get('doc_description', ''),
-                'page_count': len(pages),
-                'structure': result['structure'],
-                'pages': pages,
-            }
-
-        elif mode == "md" or (mode == "auto" and is_md):
-            print(f"Indexing Markdown: {file_path}")
-            coro = md_to_tree(
-                md_path=file_path,
-                if_thinning=False,
-                if_add_node_summary='yes',
-                summary_token_threshold=200,
-                model=self.model,
-                if_add_doc_description='yes',
-                if_add_node_text='yes',
-                if_add_node_id='yes'
-            )
-            try:
-                asyncio.get_running_loop()
-                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
-                    result = pool.submit(asyncio.run, coro).result()
-            except RuntimeError:
-                result = asyncio.run(coro)
-            self.documents[doc_id] = {
-                'id': doc_id,
-                'type': 'md',
-                'path': file_path,
-                'doc_name': result.get('doc_name', ''),
-                'doc_description': result.get('doc_description', ''),
-                'line_count': result.get('line_count', 0),
-                'structure': result['structure'],
-            }
+        if isinstance(index_config, IndexConfig):
+            opt = index_config.model_copy(update=overrides)
+        elif isinstance(index_config, dict):
+            merged = {**index_config, **overrides}  # explicit model/retrieve_model win
+            opt = IndexConfig(**merged)
        else:
-            raise ValueError(f"Unsupported file format for: {file_path}")
+            opt = IndexConfig(**overrides) if overrides else IndexConfig()

-        print(f"Indexing complete. Document ID: {doc_id}")
-        if self.workspace:
-            self._save_doc(doc_id)
-        return doc_id
+        self._validate_llm_provider(opt.model)
+
+        storage_path = Path(storage_path or ".pageindex").resolve()
+        storage_path.mkdir(parents=True, exist_ok=True)
+
+        from .storage.sqlite import SQLiteStorage
+        from .backend.local import LocalBackend
+        storage_engine = storage or SQLiteStorage(str(storage_path / "pageindex.db"))
+        self._backend = LocalBackend(
+            storage=storage_engine,
+            files_dir=str(storage_path / "files"),
+            model=opt.model,
+            retrieve_model=_normalize_retrieve_model(opt.retrieve_model or opt.model),
+            index_config=opt,
+        )

    @staticmethod
-    def _make_meta_entry(doc: dict) -> dict:
-        """Build a lightweight meta entry from a document dict."""
-        entry = {
-            'type': doc.get('type', ''),
-            'doc_name': doc.get('doc_name', ''),
-            'doc_description': doc.get('doc_description', ''),
-            'path': doc.get('path', ''),
-        }
-        if doc.get('type') == 'pdf':
-            entry['page_count'] = doc.get('page_count')
-        elif doc.get('type') == 'md':
-            entry['line_count'] = doc.get('line_count')
-        return entry
-
-    @staticmethod
-    def _read_json(path) -> dict | None:
-        """Read a JSON file, returning None on any error."""
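+    def _validate_llm_provider(model: str) -> None:
+        """Check via litellm that an API key is configured for the model's provider; raise PageIndexError if not. Returns silently if litellm cannot be imported or cannot resolve the provider."""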
        try:
-            with open(path, "r", encoding="utf-8") as f:
-                return json.load(f)
-        except (json.JSONDecodeError, OSError) as e:
-            print(f"Warning: corrupt {Path(path).name}: {e}")
-            return None
-
-    def _save_doc(self, doc_id: str):
-        doc = self.documents[doc_id].copy()
-        # Strip text from structure nodes — redundant with pages (PDF only)
-        if doc.get('structure') and doc.get('type') == 'pdf':
-            doc['structure'] = remove_fields(doc['structure'], fields=['text'])
-        path = self.workspace / f"{doc_id}.json"
-        with open(path, "w", encoding="utf-8") as f:
-            json.dump(doc, f, ensure_ascii=False, indent=2)
-        self._save_meta(doc_id, self._make_meta_entry(doc))
-        # Drop heavy fields; will lazy-load on demand
-        self.documents[doc_id].pop('structure', None)
-        self.documents[doc_id].pop('pages', None)
-
-    def _rebuild_meta(self) -> dict:
-        """Scan individual doc JSON files and return a meta dict."""
-        meta = {}
-        for path in self.workspace.glob("*.json"):
-            if path.name == META_INDEX:
-                continue
-            doc = self._read_json(path)
-            if doc and isinstance(doc, dict):
-                meta[path.stem] = self._make_meta_entry(doc)
-        return meta
-
-    def _read_meta(self) -> dict | None:
-        """Read and validate _meta.json, returning None on any corruption."""
-        meta = self._read_json(self.workspace / META_INDEX)
-        if meta is not None and not isinstance(meta, dict):
-            print(f"Warning: {META_INDEX} is not a JSON object, ignoring")
-            return None
-        return meta
-
-    def _save_meta(self, doc_id: str, entry: dict):
-        meta = self._read_meta() or self._rebuild_meta()
-        meta[doc_id] = entry
-        meta_path = self.workspace / META_INDEX
-        with open(meta_path, "w", encoding="utf-8") as f:
-            json.dump(meta, f, ensure_ascii=False, indent=2)
-
-    def _load_workspace(self):
-        meta = self._read_meta()
-        if meta is None:
-            meta = self._rebuild_meta()
-            if meta:
-                print(f"Loaded {len(meta)} document(s) from workspace (legacy mode).")
-        for doc_id, entry in meta.items():
-            doc = dict(entry, id=doc_id)
-            if doc.get('path') and not os.path.isabs(doc['path']):
-                doc['path'] = str((self.workspace / doc['path']).resolve())
-            self.documents[doc_id] = doc
-
-    def _ensure_doc_loaded(self, doc_id: str):
-        """Load full document JSON on demand (structure, pages, etc.)."""
-        doc = self.documents.get(doc_id)
-        if not doc or doc.get('structure') is not None:
+            import litellm
+            litellm.model_cost_map_url = ""
+            _, provider, _, _ = litellm.get_llm_provider(model=model)
+        except Exception:
            return
-        full = self._read_json(self.workspace / f"{doc_id}.json")
-        if not full:
-            return
-        doc['structure'] = full.get('structure', [])
-        if full.get('pages'):
-            doc['pages'] = full['pages']

-    def get_document(self, doc_id: str) -> str:
-        """Return document metadata JSON."""
-        return get_document(self.documents, doc_id)
+        key = litellm.get_api_key(llm_provider=provider, dynamic_api_key=None)
+        if not key:
+            import os
+            common_var = f"{provider.upper()}_API_KEY"
+            if not os.getenv(common_var):
+                from .errors import PageIndexError
+                raise PageIndexError(
+                    f"API key not configured for provider '{provider}' (model: {model}). "
+                    f"Set the {common_var} environment variable."
+                )

-    def get_document_structure(self, doc_id: str) -> str:
-        """Return document tree structure JSON (without text fields)."""
-        if self.workspace:
-            self._ensure_doc_loaded(doc_id)
-        return get_document_structure(self.documents, doc_id)
+    def collection(self, name: str = "default") -> Collection:
+        """Get or create a collection. Defaults to 'default'.
+
+        Asks the backend to get-or-create ``name``, then returns a new
+        Collection object bound to that name and to this client's backend.
+        """
+        # The backend call's return value is discarded here.
+        self._backend.get_or_create_collection(name)
+        return Collection(name=name, backend=self._backend)

-    def get_page_content(self, doc_id: str, pages: str) -> str:
-        """Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
-        if self.workspace:
-            self._ensure_doc_loaded(doc_id)
-        return get_page_content(self.documents, doc_id, pages)
+    def list_collections(self) -> list[str]:
+        """Return the list of collections as reported by the backend."""
+        return self._backend.list_collections()
+
+    def delete_collection(self, name: str) -> None:
+        """Delete the collection ``name`` by delegating to the backend."""
+        # NOTE(review): behaviour for a name that does not exist is decided by the backend — confirm.
+        self._backend.delete_collection(name)
+
+    def register_parser(self, parser: DocumentParser) -> None:
+        """Register a custom document parser. Only available in local mode.
+
+        Raises:
+            PageIndexError: If the active backend has no ``register_parser``
+                attribute; the message reports this as cloud mode.
+        """
+        # Capability check on the backend object rather than on a stored mode flag.
+        if not hasattr(self._backend, 'register_parser'):
+            from .errors import PageIndexError
+            raise PageIndexError("Custom parsers are not supported in cloud mode")
+        self._backend.register_parser(parser)
+
+
+class LocalClient(PageIndexClient):
+    """Local mode — indexes and queries documents on your machine.
+
+    Args:
+        model: LLM model for indexing (default: gpt-4o-2024-11-20)
+        retrieve_model: LLM model for agent QA (default: same as model)
+        storage_path: Directory for SQLite DB and files (default: ./.pageindex)
+        storage: Custom StorageEngine instance (default: SQLiteStorage)
+        index_config: Advanced indexing parameters. Pass an IndexConfig instance
+            or a dict. All fields have sensible defaults — most users don't need this.
+
+    Example::
+
+        # Simple — defaults are fine
+        client = LocalClient(model="gpt-5.4")
+
+        # Advanced — tune indexing parameters
+        from pageindex.config import IndexConfig
+        client = LocalClient(
+            model="gpt-5.4",
+            index_config=IndexConfig(toc_check_page_num=30),
+        )
+    """
+
+    def __init__(self, model: str = None, retrieve_model: str = None,
+                 storage_path: str = None, storage=None,
+                 index_config: IndexConfig | dict = None):
+        """Initialise the client in local mode.
+
+        An explicit ``model`` / ``retrieve_model`` takes precedence over the
+        same field supplied through ``index_config``.
+        """
+        # Calls _init_local directly instead of PageIndexClient.__init__,
+        # so the parent's api_key-based mode selection never runs.
+        self._init_local(model, retrieve_model, storage_path, storage, index_config)
+
+
+class CloudClient(PageIndexClient):
+    """Cloud mode — fully managed by PageIndex cloud service. No LLM key needed.
+
+    Args:
+        api_key: PageIndex cloud API key, passed through to the cloud backend.
+    """
+
+    def __init__(self, api_key: str):
+        # Calls _init_cloud directly instead of PageIndexClient.__init__,
+        # so api_key is forwarded as given, without the parent's truthiness check.
+        self._init_cloud(api_key)