feat: add PageIndex SDK with local/cloud dual-mode support (#207)

This commit is contained in:
Kylin 2026-04-06 22:51:04 +08:00 committed by Ray
parent f2dcffc0b7
commit c7fe93bb56
45 changed files with 4225 additions and 274 deletions

View file

352
pageindex/backend/cloud.py Normal file
View file

@@ -0,0 +1,352 @@
# pageindex/backend/cloud.py
"""CloudBackend — connects to PageIndex cloud service (api.pageindex.ai).
API reference: https://github.com/VectifyAI/pageindex_sdk
"""
from __future__ import annotations
import json
import logging
import os
import re
import time
import urllib.parse
import requests
from typing import AsyncIterator
from .protocol import AgentTools
from ..errors import CloudAPIError, PageIndexError
from ..events import QueryEvent
logger = logging.getLogger(__name__)

API_BASE = "https://api.pageindex.ai"

# Tool names the cloud agent uses internally; their invocations are filtered
# out of the streamed events so callers only see domain-level tool calls.
_INTERNAL_TOOLS = frozenset({"ToolSearch", "Read", "Grep", "Glob", "Bash", "Edit", "Write"})


class CloudBackend:
    """Backend that proxies all operations to the PageIndex cloud API.

    Collections are mapped onto cloud "folders". Folders require a Max plan;
    when the API rejects folder operations (403), documents fall back to a
    single global space and a one-time warning is logged.
    """

    def __init__(self, api_key: str):
        self._api_key = api_key
        # The cloud API authenticates with a bare "api_key" header.
        self._headers = {"api_key": api_key}
        # collection name -> folder id; None means "folders unavailable or
        # folder not found". Cached to avoid repeated /folders/ lookups.
        self._folder_id_cache: dict[str, str | None] = {}
        self._folder_warning_shown = False

    # ── HTTP helpers ──────────────────────────────────────────────────────
    def _warn_folder_upgrade(self) -> None:
        """Log the folders-require-Max-plan warning once per instance."""
        if not self._folder_warning_shown:
            logger.warning(
                "Folders (collections) require a Max plan. "
                "All documents are stored in a single global space — collection names are ignored. "
                "Upgrade at https://dash.pageindex.ai/subscription"
            )
            self._folder_warning_shown = True

    def _request(self, method: str, path: str, **kwargs) -> dict:
        """Issue one API request with retry and exponential backoff.

        Retries up to 3 times on transient statuses (429/500/502/503) and on
        connection-level errors. Any 2xx response counts as success (e.g.
        204 No Content from delete endpoints); an empty body yields {}.

        Raises:
            CloudAPIError: on non-retryable errors or exhausted retries.
        """
        url = f"{API_BASE}{path}"
        for attempt in range(3):
            try:
                resp = requests.request(method, url, headers=self._headers, timeout=30, **kwargs)
            except requests.RequestException as e:
                if attempt == 2:
                    raise CloudAPIError(f"Cloud API request failed: {e}") from e
                time.sleep(2 ** attempt)
                continue
            if resp.status_code in (429, 500, 502, 503):
                logger.warning("Cloud API %s %s returned %d, retrying...", method, path, resp.status_code)
                if attempt < 2:  # sleeping after the final attempt is pointless
                    time.sleep(2 ** attempt)
                continue
            if not 200 <= resp.status_code < 300:
                body = resp.text[:500] if resp.text else ""
                raise CloudAPIError(f"Cloud API error {resp.status_code}: {body}")
            return resp.json() if resp.content else {}
        raise CloudAPIError("Max retries exceeded")

    @staticmethod
    def _validate_collection_name(name: str) -> None:
        """Reject names that are not 1-128 chars of [a-zA-Z0-9_-]."""
        if not re.match(r'^[a-zA-Z0-9_-]{1,128}$', name):
            raise PageIndexError(
                f"Invalid collection name: {name!r}. "
                "Must be 1-128 chars of [a-zA-Z0-9_-]."
            )

    @staticmethod
    def _enc(value: str) -> str:
        """Percent-encode a URL path segment (no characters treated as safe)."""
        return urllib.parse.quote(value, safe="")

    # ── Collection management (mapped to folders) ─────────────────────────
    def _create_folder(self, name: str) -> None:
        """POST a new folder and cache its id; downgrade 403 to a warning.

        NOTE(review): 403 detection by substring match on the error message
        is fragile — consider carrying the status code on CloudAPIError.
        """
        try:
            resp = self._request("POST", "/folder/", json={"name": name})
            self._folder_id_cache[name] = resp.get("folder", {}).get("id")
        except CloudAPIError as e:
            if "403" in str(e):
                self._warn_folder_upgrade()
                self._folder_id_cache[name] = None
            else:
                raise

    def create_collection(self, name: str) -> None:
        """Create a cloud folder for this collection name."""
        self._validate_collection_name(name)
        self._create_folder(name)

    def get_or_create_collection(self, name: str) -> None:
        """Reuse an existing folder named *name*, creating it if absent."""
        self._validate_collection_name(name)
        try:
            data = self._request("GET", "/folders/")
            for folder in data.get("folders", []):
                if folder.get("name") == name:
                    self._folder_id_cache[name] = folder["id"]
                    return
        except CloudAPIError as e:
            if "403" in str(e):
                self._warn_folder_upgrade()
                self._folder_id_cache[name] = None
                return
            raise
        self._create_folder(name)

    def _get_folder_id(self, name: str) -> str | None:
        """Resolve collection name to folder ID. Returns None if folders not available."""
        if name in self._folder_id_cache:
            return self._folder_id_cache.get(name)
        try:
            data = self._request("GET", "/folders/")
            for folder in data.get("folders", []):
                if folder.get("name") == name:
                    self._folder_id_cache[name] = folder["id"]
                    return folder["id"]
        except CloudAPIError:
            # Best-effort: treat any failure as "folders unavailable".
            pass
        self._folder_id_cache[name] = None
        return None

    def list_collections(self) -> list[str]:
        """Return the names of all cloud folders."""
        data = self._request("GET", "/folders/")
        return [f["name"] for f in data.get("folders", [])]

    def delete_collection(self, name: str) -> None:
        """Delete the folder backing *name*; silently no-op if unresolved."""
        folder_id = self._get_folder_id(name)
        if folder_id:
            self._request("DELETE", f"/folder/{self._enc(folder_id)}/")

    # ── Document management ───────────────────────────────────────────────
    def add_document(self, collection: str, file_path: str) -> str:
        """Upload a file and block until it is retrieval-ready.

        Polls the tree endpoint every 5s for up to 10 minutes.

        Returns:
            The cloud-assigned document id.

        Raises:
            CloudAPIError: if indexing fails or times out.
        """
        folder_id = self._get_folder_id(collection)
        data = {"if_retrieval": "true"}
        if folder_id:
            data["folder_id"] = folder_id
        with open(file_path, "rb") as f:
            resp = self._request("POST", "/doc/", files={"file": f}, data=data)
        doc_id = resp["doc_id"]
        for _ in range(120):  # 120 polls * 5s = 10 min max
            tree_resp = self._request("GET", f"/doc/{self._enc(doc_id)}/", params={"type": "tree"})
            if tree_resp.get("retrieval_ready"):
                return doc_id
            if tree_resp.get("status", "") == "failed":
                raise CloudAPIError(f"Document {doc_id} indexing failed")
            time.sleep(5)
        raise CloudAPIError(f"Document {doc_id} indexing timed out")

    def get_document(self, collection: str, doc_id: str, include_text: bool = False) -> dict:
        """Fetch document metadata plus its normalized tree structure.

        ``include_text`` is accepted for interface parity with LocalBackend;
        the cloud tree already carries whatever text the service returns.
        """
        resp = self._request("GET", f"/doc/{self._enc(doc_id)}/metadata/")
        tree_resp = self._request("GET", f"/doc/{self._enc(doc_id)}/",
                                  params={"type": "tree", "summary": "true"})
        raw_tree = tree_resp.get("tree", tree_resp.get("structure", tree_resp.get("result", [])))
        return {
            "doc_id": resp.get("id", doc_id),
            "doc_name": resp.get("name", ""),
            "doc_description": resp.get("description", ""),
            "doc_type": "pdf",  # hard-coded: presumably cloud ingests PDFs only — TODO confirm
            "status": resp.get("status", ""),
            "structure": self._normalize_tree(raw_tree),
        }

    def get_document_structure(self, collection: str, doc_id: str) -> list:
        """Return the normalized tree structure (with summaries) for a doc."""
        resp = self._request("GET", f"/doc/{self._enc(doc_id)}/", params={"type": "tree", "summary": "true"})
        raw_tree = resp.get("tree", resp.get("structure", resp.get("result", [])))
        return self._normalize_tree(raw_tree)

    def get_page_content(self, collection: str, doc_id: str, pages: str) -> list:
        """Return OCR page content for a page spec like '5-7', '3,8', '12'."""
        resp = self._request("GET", f"/doc/{self._enc(doc_id)}/", params={"type": "ocr", "format": "page"})
        from ..index.utils import parse_pages
        page_nums = set(parse_pages(pages))
        all_pages = resp.get("pages", resp.get("ocr", resp.get("result", [])))
        if not isinstance(all_pages, list):
            return []
        result = []
        for p in all_pages:
            # Field names vary across API versions, hence the fallbacks.
            num = p.get("page", p.get("page_index"))
            if num in page_nums:
                result.append({"page": num,
                               "content": p.get("content", p.get("markdown", ""))})
        return result

    @staticmethod
    def _normalize_tree(nodes: list) -> list:
        """Normalize cloud tree nodes to match the local structure schema.

        Maps legacy/alternate field names (``prefix_summary``, ``page_index``)
        onto the canonical keys and recurses into child ``nodes``.
        """
        result = []
        for node in nodes:
            normalized = {
                "title": node.get("title", ""),
                "node_id": node.get("node_id", ""),
                "summary": node.get("summary", node.get("prefix_summary", "")),
                "start_index": node.get("start_index", node.get("page_index")),
                "end_index": node.get("end_index", node.get("page_index")),
            }
            if "text" in node:
                normalized["text"] = node["text"]
            children = node.get("nodes", [])
            if children:
                normalized["nodes"] = CloudBackend._normalize_tree(children)
            result.append(normalized)
        return result

    def list_documents(self, collection: str) -> list[dict]:
        """List documents, scoped to the collection's folder when available."""
        folder_id = self._get_folder_id(collection)
        params = {"limit": 100}
        if folder_id:
            params["folder_id"] = folder_id
        data = self._request("GET", "/docs/", params=params)
        return [
            {"doc_id": d.get("id", ""), "doc_name": d.get("name", ""), "doc_type": "pdf"}
            for d in data.get("documents", [])
        ]

    def delete_document(self, collection: str, doc_id: str) -> None:
        """Delete a document from the cloud service."""
        self._request("DELETE", f"/doc/{self._enc(doc_id)}/")

    # ── Query (uses cloud chat/completions, no LLM key needed) ────────────
    def query(self, collection: str, question: str, doc_ids: list[str] | None = None) -> str:
        """Non-streaming query via cloud chat/completions."""
        doc_id = doc_ids if doc_ids else self._get_all_doc_ids(collection)
        resp = self._request("POST", "/chat/completions/", json={
            "messages": [{"role": "user", "content": question}],
            "doc_id": doc_id,
            "stream": False,
        })
        choices = resp.get("choices", [])
        if choices:
            return choices[0].get("message", {}).get("content", "")
        # Some responses use a flat payload instead of OpenAI-style choices.
        return resp.get("content", resp.get("answer", ""))

    async def query_stream(self, collection: str, question: str,
                           doc_ids: list[str] | None = None) -> AsyncIterator[QueryEvent]:
        """Stream a query's answer and tool-call events in real time.

        A daemon thread consumes the blocking SSE response and forwards
        events through an asyncio.Queue. A ``None`` sentinel is guaranteed
        to arrive last — even when the connection itself fails — so this
        generator always terminates (the original could hang forever if
        ``requests.post`` raised before the try/finally was entered).
        """
        import asyncio
        import threading

        doc_id = doc_ids if doc_ids else self._get_all_doc_ids(collection)
        headers = self._headers
        queue: asyncio.Queue[QueryEvent | None] = asyncio.Queue()
        loop = asyncio.get_running_loop()

        def _emit(event: QueryEvent | None) -> None:
            """Thread-safely push an event (or the sentinel) onto the queue."""
            loop.call_soon_threadsafe(queue.put_nowait, event)

        def _stream() -> None:
            """Worker thread: read the SSE stream and forward events."""
            resp = None
            try:
                resp = requests.post(
                    f"{API_BASE}/chat/completions/",
                    headers=headers,
                    json={
                        "messages": [{"role": "user", "content": question}],
                        "doc_id": doc_id,
                        "stream": True,
                        "stream_metadata": True,
                    },
                    stream=True,
                    timeout=120,
                )
                if resp.status_code != 200:
                    body = resp.text[:500] if resp.text else ""
                    _emit(QueryEvent(type="answer_done",
                                     data=f"Cloud streaming error {resp.status_code}: {body}"))
                    return
                current_tool_name = None
                current_tool_args: list[str] = []
                for line in resp.iter_lines(decode_unicode=True):
                    if not line or not line.startswith("data: "):
                        continue
                    data_str = line[6:]
                    if data_str.strip() == "[DONE]":
                        break
                    try:
                        chunk = json.loads(data_str)
                    except json.JSONDecodeError:
                        continue  # skip malformed SSE payloads
                    meta = chunk.get("block_metadata", {})
                    block_type = meta.get("type", "")
                    choices = chunk.get("choices", [])
                    delta = choices[0].get("delta", {}) if choices else {}
                    content = delta.get("content", "")
                    if block_type == "mcp_tool_use_start":
                        current_tool_name = meta.get("tool_name", "")
                        current_tool_args = []
                    elif block_type == "tool_use":
                        if content:
                            current_tool_args.append(content)
                    elif block_type == "tool_use_stop":
                        # Surface only user-relevant tools, not agent internals.
                        if current_tool_name and current_tool_name not in _INTERNAL_TOOLS:
                            _emit(QueryEvent(type="tool_call", data={
                                "name": current_tool_name,
                                "args": "".join(current_tool_args),
                            }))
                        current_tool_name = None
                        current_tool_args = []
                    elif block_type == "text" and content:
                        _emit(QueryEvent(type="answer_delta", data=content))
            except requests.RequestException as e:
                # Connection-level failure: report it instead of dying silently.
                _emit(QueryEvent(type="answer_done", data=f"Cloud streaming error: {e}"))
            finally:
                if resp is not None:
                    resp.close()
                _emit(None)  # sentinel — always delivered

        thread = threading.Thread(target=_stream, daemon=True)
        thread.start()
        while True:
            event = await queue.get()
            if event is None:
                break
            yield event
        # The sentinel is the thread's last action, so this join returns
        # almost immediately; run it off-loop to keep the event loop responsive.
        await asyncio.to_thread(thread.join, 5)

    def _get_all_doc_ids(self, collection: str) -> list[str]:
        """Get all document IDs in a collection."""
        docs = self.list_documents(collection)
        return [d["doc_id"] for d in docs]

    # ── Not used in cloud mode ────────────────────────────────────────────
    def get_agent_tools(self, collection: str, doc_ids: list[str] | None = None) -> AgentTools:
        """Not used in cloud mode — query goes through chat/completions."""
        return AgentTools()

245
pageindex/backend/local.py Normal file
View file

@@ -0,0 +1,245 @@
# pageindex/backend/local.py
import hashlib
import os
import re
import uuid
import shutil
from pathlib import Path
from ..parser.protocol import DocumentParser, ParsedDocument
from ..parser.pdf import PdfParser
from ..parser.markdown import MarkdownParser
from ..storage.protocol import StorageEngine
from ..index.pipeline import build_index
from ..index.utils import parse_pages, get_pdf_page_content, get_md_page_content, remove_fields
from ..backend.protocol import AgentTools
from ..errors import FileTypeError, DocumentNotFoundError, IndexingError, PageIndexError
_COLLECTION_NAME_RE = re.compile(r'^[a-zA-Z0-9_-]{1,128}$')
class LocalBackend:
def __init__(self, storage: StorageEngine, files_dir: str, model: str = None,
retrieve_model: str = None, index_config=None):
self._storage = storage
self._files_dir = Path(files_dir)
self._model = model
self._retrieve_model = retrieve_model or model
self._index_config = index_config
self._parsers: list[DocumentParser] = [PdfParser(), MarkdownParser()]
def register_parser(self, parser: DocumentParser) -> None:
self._parsers.insert(0, parser) # user parsers checked first
def get_retrieve_model(self) -> str | None:
return self._retrieve_model
def _resolve_parser(self, file_path: str) -> DocumentParser:
ext = os.path.splitext(file_path)[1].lower()
for parser in self._parsers:
if ext in parser.supported_extensions():
return parser
raise FileTypeError(f"No parser for extension: {ext}")
# Collection management
def _validate_collection_name(self, name: str) -> None:
if not _COLLECTION_NAME_RE.match(name):
raise PageIndexError(f"Invalid collection name: {name!r}. Must be 1-128 chars of [a-zA-Z0-9_-].")
def create_collection(self, name: str) -> None:
self._validate_collection_name(name)
self._storage.create_collection(name)
def get_or_create_collection(self, name: str) -> None:
self._validate_collection_name(name)
self._storage.get_or_create_collection(name)
def list_collections(self) -> list[str]:
return self._storage.list_collections()
def delete_collection(self, name: str) -> None:
self._storage.delete_collection(name)
col_dir = self._files_dir / name
if col_dir.exists():
shutil.rmtree(col_dir)
@staticmethod
def _file_hash(file_path: str) -> str:
"""Compute SHA-256 hash of a file."""
h = hashlib.sha256()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(65536), b""):
h.update(chunk)
return h.hexdigest()
# Document management
def add_document(self, collection: str, file_path: str) -> str:
file_path = os.path.realpath(file_path)
if not os.path.isfile(file_path):
raise FileTypeError(f"Not a regular file: {file_path}")
parser = self._resolve_parser(file_path)
# Dedup: skip if same file already indexed in this collection
file_hash = self._file_hash(file_path)
existing_id = self._storage.find_document_by_hash(collection, file_hash)
if existing_id:
return existing_id
doc_id = str(uuid.uuid4())
# Copy file to managed directory
ext = os.path.splitext(file_path)[1]
col_dir = self._files_dir / collection
col_dir.mkdir(parents=True, exist_ok=True)
managed_path = col_dir / f"{doc_id}{ext}"
shutil.copy2(file_path, managed_path)
try:
# Store images alongside the document: files/{collection}/{doc_id}/images/
images_dir = str(col_dir / doc_id / "images")
parsed = parser.parse(file_path, model=self._model, images_dir=images_dir)
result = build_index(parsed, model=self._model, opt=self._index_config)
# Cache page text for fast retrieval (avoids re-reading files)
pages = [{"page": n.index, "content": n.content,
**({"images": n.images} if n.images else {})}
for n in parsed.nodes if n.content]
# Strip text from structure to save storage space (PDF only;
# markdown needs text in structure for fallback retrieval)
doc_type = ext.lstrip(".")
if doc_type == "pdf":
clean_structure = remove_fields(result["structure"], fields=["text"])
else:
clean_structure = result["structure"]
self._storage.save_document(collection, doc_id, {
"doc_name": parsed.doc_name,
"doc_description": result.get("doc_description", ""),
"file_path": str(managed_path),
"file_hash": file_hash,
"doc_type": doc_type,
"structure": clean_structure,
"pages": pages,
})
except Exception as e:
managed_path.unlink(missing_ok=True)
doc_dir = col_dir / doc_id
if doc_dir.exists():
shutil.rmtree(doc_dir)
raise IndexingError(f"Failed to index {file_path}: {e}") from e
return doc_id
def get_document(self, collection: str, doc_id: str, include_text: bool = False) -> dict:
"""Get document metadata with structure.
Args:
include_text: If True, populate each structure node's 'text' field
from cached page content. WARNING: may be very large do NOT
use in agent/LLM contexts as it can exhaust the context window.
"""
doc = self._storage.get_document(collection, doc_id)
if not doc:
return {}
doc["structure"] = self._storage.get_document_structure(collection, doc_id)
if include_text:
pages = self._storage.get_pages(collection, doc_id) or []
page_map = {p["page"]: p["content"] for p in pages}
self._fill_node_text(doc["structure"], page_map)
return doc
@staticmethod
def _fill_node_text(nodes: list, page_map: dict) -> None:
"""Recursively fill 'text' on structure nodes from cached page content."""
for node in nodes:
start = node.get("start_index")
end = node.get("end_index")
if start is not None and end is not None:
node["text"] = "\n".join(
page_map.get(p, "") for p in range(start, end + 1)
)
if "nodes" in node:
LocalBackend._fill_node_text(node["nodes"], page_map)
def get_document_structure(self, collection: str, doc_id: str) -> list:
return self._storage.get_document_structure(collection, doc_id)
def get_page_content(self, collection: str, doc_id: str, pages: str) -> list:
doc = self._storage.get_document(collection, doc_id)
if not doc:
raise DocumentNotFoundError(f"Document {doc_id} not found")
page_nums = parse_pages(pages)
# Try cached pages first (fast, no file I/O)
cached_pages = self._storage.get_pages(collection, doc_id)
if cached_pages:
return [p for p in cached_pages if p["page"] in page_nums]
# Fallback to reading from file
if doc["doc_type"] == "pdf":
return get_pdf_page_content(doc["file_path"], page_nums)
else:
structure = self._storage.get_document_structure(collection, doc_id)
return get_md_page_content(structure, page_nums)
def list_documents(self, collection: str) -> list[dict]:
return self._storage.list_documents(collection)
def delete_document(self, collection: str, doc_id: str) -> None:
doc = self._storage.get_document(collection, doc_id)
if doc and doc.get("file_path"):
Path(doc["file_path"]).unlink(missing_ok=True)
# Clean up images directory: files/{collection}/{doc_id}/
doc_dir = self._files_dir / collection / doc_id
if doc_dir.exists():
shutil.rmtree(doc_dir)
self._storage.delete_document(collection, doc_id)
def get_agent_tools(self, collection: str, doc_ids: list[str] | None = None) -> AgentTools:
from agents import function_tool
import json
storage = self._storage
col_name = collection
backend = self
filter_ids = doc_ids
@function_tool
def list_documents() -> str:
"""List all documents in the collection."""
docs = storage.list_documents(col_name)
if filter_ids:
docs = [d for d in docs if d["doc_id"] in filter_ids]
return json.dumps(docs)
@function_tool
def get_document(doc_id: str) -> str:
"""Get document metadata."""
return json.dumps(storage.get_document(col_name, doc_id))
@function_tool
def get_document_structure(doc_id: str) -> str:
"""Get document tree structure (without text)."""
structure = storage.get_document_structure(col_name, doc_id)
return json.dumps(remove_fields(structure, fields=["text"]), ensure_ascii=False)
@function_tool
def get_page_content(doc_id: str, pages: str) -> str:
"""Get page content. Use tight ranges: '5-7', '3,8', '12'."""
result = backend.get_page_content(col_name, doc_id, pages)
return json.dumps(result, ensure_ascii=False)
return AgentTools(function_tools=[list_documents, get_document, get_document_structure, get_page_content])
def query(self, collection: str, question: str, doc_ids: list[str] | None = None) -> str:
from ..agent import AgentRunner
tools = self.get_agent_tools(collection, doc_ids)
return AgentRunner(tools=tools, model=self._retrieve_model).run(question)
async def query_stream(self, collection: str, question: str,
doc_ids: list[str] | None = None):
from ..agent import QueryStream
tools = self.get_agent_tools(collection, doc_ids)
stream = QueryStream(tools=tools, question=question, model=self._retrieve_model)
async for event in stream:
yield event

View file

@@ -0,0 +1,34 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Protocol, Any, AsyncIterator, runtime_checkable
from ..events import QueryEvent
@dataclass
class AgentTools:
    """Structured container for agent tool configuration (local mode only).

    CloudBackend returns an empty instance; LocalBackend populates
    ``function_tools`` with ``@function_tool``-wrapped callables.
    """
    # Callable tools handed to the retrieval agent (agents-SDK function tools).
    function_tools: list[Any] = field(default_factory=list)
    # Extra server handles to attach to the agent; presumably MCP servers —
    # not populated anywhere in this file, TODO confirm against the agent code.
    mcp_servers: list[Any] = field(default_factory=list)
@runtime_checkable
class Backend(Protocol):
    """Structural interface implemented by both LocalBackend and CloudBackend.

    ``@runtime_checkable`` enables ``isinstance(obj, Backend)`` checks; note
    such checks verify method presence only, not signatures.
    """

    # Collection management
    def create_collection(self, name: str) -> None: ...
    def get_or_create_collection(self, name: str) -> None: ...
    def list_collections(self) -> list[str]: ...
    def delete_collection(self, name: str) -> None: ...
    # Document management
    # add_document returns the backend-assigned document id.
    def add_document(self, collection: str, file_path: str) -> str: ...
    # include_text=True may inline large page text — avoid in LLM contexts.
    def get_document(self, collection: str, doc_id: str, include_text: bool = False) -> dict: ...
    def get_document_structure(self, collection: str, doc_id: str) -> list: ...
    # ``pages`` is a spec string such as '5-7', '3,8', or '12'.
    def get_page_content(self, collection: str, doc_id: str, pages: str) -> list: ...
    def list_documents(self, collection: str) -> list[dict]: ...
    def delete_document(self, collection: str, doc_id: str) -> None: ...
    # Query
    # doc_ids, when given, restricts the query to those documents.
    def query(self, collection: str, question: str, doc_ids: list[str] | None = None) -> str: ...
    async def query_stream(self, collection: str, question: str,
                           doc_ids: list[str] | None = None) -> AsyncIterator[QueryEvent]: ...