feat: add PageIndex SDK with local/cloud dual-mode support (#207)

This commit is contained in:
Kylin 2026-04-06 22:51:04 +08:00 committed by Ray
parent f2dcffc0b7
commit c7fe93bb56
45 changed files with 4225 additions and 274 deletions

View file

352
pageindex/backend/cloud.py Normal file
View file

@@ -0,0 +1,352 @@
# pageindex/backend/cloud.py
"""CloudBackend — connects to PageIndex cloud service (api.pageindex.ai).
API reference: https://github.com/VectifyAI/pageindex_sdk
"""
from __future__ import annotations
import json
import logging
import os
import re
import time
import urllib.parse
import requests
from typing import AsyncIterator
from .protocol import AgentTools
from ..errors import CloudAPIError, PageIndexError
from ..events import QueryEvent
logger = logging.getLogger(__name__)

API_BASE = "https://api.pageindex.ai"

# Tool names the cloud agent uses internally; their invocations are filtered
# out of the streamed events so callers only see domain-level tool calls.
_INTERNAL_TOOLS = frozenset({"ToolSearch", "Read", "Grep", "Glob", "Bash", "Edit", "Write"})


class CloudBackend:
    """Backend that proxies all operations to the PageIndex cloud API.

    Collections are mapped onto cloud "folders". Folders require a Max plan;
    when the API rejects folder operations (403), documents fall back to a
    single global space and a one-time warning is logged.
    """

    def __init__(self, api_key: str):
        self._api_key = api_key
        # The cloud API authenticates with a bare "api_key" header.
        self._headers = {"api_key": api_key}
        # collection name -> folder id; None means "folders unavailable or
        # folder not found". Cached to avoid repeated /folders/ lookups.
        self._folder_id_cache: dict[str, str | None] = {}
        self._folder_warning_shown = False

    # ── HTTP helpers ──────────────────────────────────────────────────────
    def _warn_folder_upgrade(self) -> None:
        """Log the folders-require-Max-plan warning once per instance."""
        if not self._folder_warning_shown:
            logger.warning(
                "Folders (collections) require a Max plan. "
                "All documents are stored in a single global space — collection names are ignored. "
                "Upgrade at https://dash.pageindex.ai/subscription"
            )
            self._folder_warning_shown = True

    def _request(self, method: str, path: str, **kwargs) -> dict:
        """Issue one API request with retry and exponential backoff.

        Retries up to 3 times on transient statuses (429/500/502/503) and on
        connection-level errors. Any 2xx response counts as success (e.g.
        204 No Content from delete endpoints); an empty body yields {}.

        Raises:
            CloudAPIError: on non-retryable errors or exhausted retries.
        """
        url = f"{API_BASE}{path}"
        for attempt in range(3):
            try:
                resp = requests.request(method, url, headers=self._headers, timeout=30, **kwargs)
            except requests.RequestException as e:
                if attempt == 2:
                    raise CloudAPIError(f"Cloud API request failed: {e}") from e
                time.sleep(2 ** attempt)
                continue
            if resp.status_code in (429, 500, 502, 503):
                logger.warning("Cloud API %s %s returned %d, retrying...", method, path, resp.status_code)
                if attempt < 2:  # sleeping after the final attempt is pointless
                    time.sleep(2 ** attempt)
                continue
            if not 200 <= resp.status_code < 300:
                body = resp.text[:500] if resp.text else ""
                raise CloudAPIError(f"Cloud API error {resp.status_code}: {body}")
            return resp.json() if resp.content else {}
        raise CloudAPIError("Max retries exceeded")

    @staticmethod
    def _validate_collection_name(name: str) -> None:
        """Reject names that are not 1-128 chars of [a-zA-Z0-9_-]."""
        if not re.match(r'^[a-zA-Z0-9_-]{1,128}$', name):
            raise PageIndexError(
                f"Invalid collection name: {name!r}. "
                "Must be 1-128 chars of [a-zA-Z0-9_-]."
            )

    @staticmethod
    def _enc(value: str) -> str:
        """Percent-encode a URL path segment (no characters treated as safe)."""
        return urllib.parse.quote(value, safe="")

    # ── Collection management (mapped to folders) ─────────────────────────
    def _create_folder(self, name: str) -> None:
        """POST a new folder and cache its id; downgrade 403 to a warning.

        NOTE(review): 403 detection by substring match on the error message
        is fragile — consider carrying the status code on CloudAPIError.
        """
        try:
            resp = self._request("POST", "/folder/", json={"name": name})
            self._folder_id_cache[name] = resp.get("folder", {}).get("id")
        except CloudAPIError as e:
            if "403" in str(e):
                self._warn_folder_upgrade()
                self._folder_id_cache[name] = None
            else:
                raise

    def create_collection(self, name: str) -> None:
        """Create a cloud folder for this collection name."""
        self._validate_collection_name(name)
        self._create_folder(name)

    def get_or_create_collection(self, name: str) -> None:
        """Reuse an existing folder named *name*, creating it if absent."""
        self._validate_collection_name(name)
        try:
            data = self._request("GET", "/folders/")
            for folder in data.get("folders", []):
                if folder.get("name") == name:
                    self._folder_id_cache[name] = folder["id"]
                    return
        except CloudAPIError as e:
            if "403" in str(e):
                self._warn_folder_upgrade()
                self._folder_id_cache[name] = None
                return
            raise
        self._create_folder(name)

    def _get_folder_id(self, name: str) -> str | None:
        """Resolve collection name to folder ID. Returns None if folders not available."""
        if name in self._folder_id_cache:
            return self._folder_id_cache.get(name)
        try:
            data = self._request("GET", "/folders/")
            for folder in data.get("folders", []):
                if folder.get("name") == name:
                    self._folder_id_cache[name] = folder["id"]
                    return folder["id"]
        except CloudAPIError:
            # Best-effort: treat any failure as "folders unavailable".
            pass
        self._folder_id_cache[name] = None
        return None

    def list_collections(self) -> list[str]:
        """Return the names of all cloud folders."""
        data = self._request("GET", "/folders/")
        return [f["name"] for f in data.get("folders", [])]

    def delete_collection(self, name: str) -> None:
        """Delete the folder backing *name*; silently no-op if unresolved."""
        folder_id = self._get_folder_id(name)
        if folder_id:
            self._request("DELETE", f"/folder/{self._enc(folder_id)}/")

    # ── Document management ───────────────────────────────────────────────
    def add_document(self, collection: str, file_path: str) -> str:
        """Upload a file and block until it is retrieval-ready.

        Polls the tree endpoint every 5s for up to 10 minutes.

        Returns:
            The cloud-assigned document id.

        Raises:
            CloudAPIError: if indexing fails or times out.
        """
        folder_id = self._get_folder_id(collection)
        data = {"if_retrieval": "true"}
        if folder_id:
            data["folder_id"] = folder_id
        with open(file_path, "rb") as f:
            resp = self._request("POST", "/doc/", files={"file": f}, data=data)
        doc_id = resp["doc_id"]
        for _ in range(120):  # 120 polls * 5s = 10 min max
            tree_resp = self._request("GET", f"/doc/{self._enc(doc_id)}/", params={"type": "tree"})
            if tree_resp.get("retrieval_ready"):
                return doc_id
            if tree_resp.get("status", "") == "failed":
                raise CloudAPIError(f"Document {doc_id} indexing failed")
            time.sleep(5)
        raise CloudAPIError(f"Document {doc_id} indexing timed out")

    def get_document(self, collection: str, doc_id: str, include_text: bool = False) -> dict:
        """Fetch document metadata plus its normalized tree structure.

        ``include_text`` is accepted for interface parity with LocalBackend;
        the cloud tree already carries whatever text the service returns.
        """
        resp = self._request("GET", f"/doc/{self._enc(doc_id)}/metadata/")
        tree_resp = self._request("GET", f"/doc/{self._enc(doc_id)}/",
                                  params={"type": "tree", "summary": "true"})
        raw_tree = tree_resp.get("tree", tree_resp.get("structure", tree_resp.get("result", [])))
        return {
            "doc_id": resp.get("id", doc_id),
            "doc_name": resp.get("name", ""),
            "doc_description": resp.get("description", ""),
            "doc_type": "pdf",  # hard-coded: presumably cloud ingests PDFs only — TODO confirm
            "status": resp.get("status", ""),
            "structure": self._normalize_tree(raw_tree),
        }

    def get_document_structure(self, collection: str, doc_id: str) -> list:
        """Return the normalized tree structure (with summaries) for a doc."""
        resp = self._request("GET", f"/doc/{self._enc(doc_id)}/", params={"type": "tree", "summary": "true"})
        raw_tree = resp.get("tree", resp.get("structure", resp.get("result", [])))
        return self._normalize_tree(raw_tree)

    def get_page_content(self, collection: str, doc_id: str, pages: str) -> list:
        """Return OCR page content for a page spec like '5-7', '3,8', '12'."""
        resp = self._request("GET", f"/doc/{self._enc(doc_id)}/", params={"type": "ocr", "format": "page"})
        from ..index.utils import parse_pages
        page_nums = set(parse_pages(pages))
        all_pages = resp.get("pages", resp.get("ocr", resp.get("result", [])))
        if not isinstance(all_pages, list):
            return []
        result = []
        for p in all_pages:
            # Field names vary across API versions, hence the fallbacks.
            num = p.get("page", p.get("page_index"))
            if num in page_nums:
                result.append({"page": num,
                               "content": p.get("content", p.get("markdown", ""))})
        return result

    @staticmethod
    def _normalize_tree(nodes: list) -> list:
        """Normalize cloud tree nodes to match the local structure schema.

        Maps legacy/alternate field names (``prefix_summary``, ``page_index``)
        onto the canonical keys and recurses into child ``nodes``.
        """
        result = []
        for node in nodes:
            normalized = {
                "title": node.get("title", ""),
                "node_id": node.get("node_id", ""),
                "summary": node.get("summary", node.get("prefix_summary", "")),
                "start_index": node.get("start_index", node.get("page_index")),
                "end_index": node.get("end_index", node.get("page_index")),
            }
            if "text" in node:
                normalized["text"] = node["text"]
            children = node.get("nodes", [])
            if children:
                normalized["nodes"] = CloudBackend._normalize_tree(children)
            result.append(normalized)
        return result

    def list_documents(self, collection: str) -> list[dict]:
        """List documents, scoped to the collection's folder when available."""
        folder_id = self._get_folder_id(collection)
        params = {"limit": 100}
        if folder_id:
            params["folder_id"] = folder_id
        data = self._request("GET", "/docs/", params=params)
        return [
            {"doc_id": d.get("id", ""), "doc_name": d.get("name", ""), "doc_type": "pdf"}
            for d in data.get("documents", [])
        ]

    def delete_document(self, collection: str, doc_id: str) -> None:
        """Delete a document from the cloud service."""
        self._request("DELETE", f"/doc/{self._enc(doc_id)}/")

    # ── Query (uses cloud chat/completions, no LLM key needed) ────────────
    def query(self, collection: str, question: str, doc_ids: list[str] | None = None) -> str:
        """Non-streaming query via cloud chat/completions."""
        doc_id = doc_ids if doc_ids else self._get_all_doc_ids(collection)
        resp = self._request("POST", "/chat/completions/", json={
            "messages": [{"role": "user", "content": question}],
            "doc_id": doc_id,
            "stream": False,
        })
        choices = resp.get("choices", [])
        if choices:
            return choices[0].get("message", {}).get("content", "")
        # Some responses use a flat payload instead of OpenAI-style choices.
        return resp.get("content", resp.get("answer", ""))

    async def query_stream(self, collection: str, question: str,
                           doc_ids: list[str] | None = None) -> AsyncIterator[QueryEvent]:
        """Stream a query's answer and tool-call events in real time.

        A daemon thread consumes the blocking SSE response and forwards
        events through an asyncio.Queue. A ``None`` sentinel is guaranteed
        to arrive last — even when the connection itself fails — so this
        generator always terminates (the original could hang forever if
        ``requests.post`` raised before the try/finally was entered).
        """
        import asyncio
        import threading

        doc_id = doc_ids if doc_ids else self._get_all_doc_ids(collection)
        headers = self._headers
        queue: asyncio.Queue[QueryEvent | None] = asyncio.Queue()
        loop = asyncio.get_running_loop()

        def _emit(event: QueryEvent | None) -> None:
            """Thread-safely push an event (or the sentinel) onto the queue."""
            loop.call_soon_threadsafe(queue.put_nowait, event)

        def _stream() -> None:
            """Worker thread: read the SSE stream and forward events."""
            resp = None
            try:
                resp = requests.post(
                    f"{API_BASE}/chat/completions/",
                    headers=headers,
                    json={
                        "messages": [{"role": "user", "content": question}],
                        "doc_id": doc_id,
                        "stream": True,
                        "stream_metadata": True,
                    },
                    stream=True,
                    timeout=120,
                )
                if resp.status_code != 200:
                    body = resp.text[:500] if resp.text else ""
                    _emit(QueryEvent(type="answer_done",
                                     data=f"Cloud streaming error {resp.status_code}: {body}"))
                    return
                current_tool_name = None
                current_tool_args: list[str] = []
                for line in resp.iter_lines(decode_unicode=True):
                    if not line or not line.startswith("data: "):
                        continue
                    data_str = line[6:]
                    if data_str.strip() == "[DONE]":
                        break
                    try:
                        chunk = json.loads(data_str)
                    except json.JSONDecodeError:
                        continue  # skip malformed SSE payloads
                    meta = chunk.get("block_metadata", {})
                    block_type = meta.get("type", "")
                    choices = chunk.get("choices", [])
                    delta = choices[0].get("delta", {}) if choices else {}
                    content = delta.get("content", "")
                    if block_type == "mcp_tool_use_start":
                        current_tool_name = meta.get("tool_name", "")
                        current_tool_args = []
                    elif block_type == "tool_use":
                        if content:
                            current_tool_args.append(content)
                    elif block_type == "tool_use_stop":
                        # Surface only user-relevant tools, not agent internals.
                        if current_tool_name and current_tool_name not in _INTERNAL_TOOLS:
                            _emit(QueryEvent(type="tool_call", data={
                                "name": current_tool_name,
                                "args": "".join(current_tool_args),
                            }))
                        current_tool_name = None
                        current_tool_args = []
                    elif block_type == "text" and content:
                        _emit(QueryEvent(type="answer_delta", data=content))
            except requests.RequestException as e:
                # Connection-level failure: report it instead of dying silently.
                _emit(QueryEvent(type="answer_done", data=f"Cloud streaming error: {e}"))
            finally:
                if resp is not None:
                    resp.close()
                _emit(None)  # sentinel — always delivered

        thread = threading.Thread(target=_stream, daemon=True)
        thread.start()
        while True:
            event = await queue.get()
            if event is None:
                break
            yield event
        # The sentinel is the thread's last action, so this join returns
        # almost immediately; run it off-loop to keep the event loop responsive.
        await asyncio.to_thread(thread.join, 5)

    def _get_all_doc_ids(self, collection: str) -> list[str]:
        """Get all document IDs in a collection."""
        docs = self.list_documents(collection)
        return [d["doc_id"] for d in docs]

    # ── Not used in cloud mode ────────────────────────────────────────────
    def get_agent_tools(self, collection: str, doc_ids: list[str] | None = None) -> AgentTools:
        """Not used in cloud mode — query goes through chat/completions."""
        return AgentTools()

245
pageindex/backend/local.py Normal file
View file

@@ -0,0 +1,245 @@
# pageindex/backend/local.py
import hashlib
import os
import re
import uuid
import shutil
from pathlib import Path
from ..parser.protocol import DocumentParser, ParsedDocument
from ..parser.pdf import PdfParser
from ..parser.markdown import MarkdownParser
from ..storage.protocol import StorageEngine
from ..index.pipeline import build_index
from ..index.utils import parse_pages, get_pdf_page_content, get_md_page_content, remove_fields
from ..backend.protocol import AgentTools
from ..errors import FileTypeError, DocumentNotFoundError, IndexingError, PageIndexError
_COLLECTION_NAME_RE = re.compile(r'^[a-zA-Z0-9_-]{1,128}$')
class LocalBackend:
def __init__(self, storage: StorageEngine, files_dir: str, model: str = None,
retrieve_model: str = None, index_config=None):
self._storage = storage
self._files_dir = Path(files_dir)
self._model = model
self._retrieve_model = retrieve_model or model
self._index_config = index_config
self._parsers: list[DocumentParser] = [PdfParser(), MarkdownParser()]
def register_parser(self, parser: DocumentParser) -> None:
self._parsers.insert(0, parser) # user parsers checked first
def get_retrieve_model(self) -> str | None:
return self._retrieve_model
def _resolve_parser(self, file_path: str) -> DocumentParser:
ext = os.path.splitext(file_path)[1].lower()
for parser in self._parsers:
if ext in parser.supported_extensions():
return parser
raise FileTypeError(f"No parser for extension: {ext}")
# Collection management
def _validate_collection_name(self, name: str) -> None:
if not _COLLECTION_NAME_RE.match(name):
raise PageIndexError(f"Invalid collection name: {name!r}. Must be 1-128 chars of [a-zA-Z0-9_-].")
def create_collection(self, name: str) -> None:
self._validate_collection_name(name)
self._storage.create_collection(name)
def get_or_create_collection(self, name: str) -> None:
self._validate_collection_name(name)
self._storage.get_or_create_collection(name)
def list_collections(self) -> list[str]:
return self._storage.list_collections()
def delete_collection(self, name: str) -> None:
self._storage.delete_collection(name)
col_dir = self._files_dir / name
if col_dir.exists():
shutil.rmtree(col_dir)
@staticmethod
def _file_hash(file_path: str) -> str:
"""Compute SHA-256 hash of a file."""
h = hashlib.sha256()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(65536), b""):
h.update(chunk)
return h.hexdigest()
# Document management
def add_document(self, collection: str, file_path: str) -> str:
file_path = os.path.realpath(file_path)
if not os.path.isfile(file_path):
raise FileTypeError(f"Not a regular file: {file_path}")
parser = self._resolve_parser(file_path)
# Dedup: skip if same file already indexed in this collection
file_hash = self._file_hash(file_path)
existing_id = self._storage.find_document_by_hash(collection, file_hash)
if existing_id:
return existing_id
doc_id = str(uuid.uuid4())
# Copy file to managed directory
ext = os.path.splitext(file_path)[1]
col_dir = self._files_dir / collection
col_dir.mkdir(parents=True, exist_ok=True)
managed_path = col_dir / f"{doc_id}{ext}"
shutil.copy2(file_path, managed_path)
try:
# Store images alongside the document: files/{collection}/{doc_id}/images/
images_dir = str(col_dir / doc_id / "images")
parsed = parser.parse(file_path, model=self._model, images_dir=images_dir)
result = build_index(parsed, model=self._model, opt=self._index_config)
# Cache page text for fast retrieval (avoids re-reading files)
pages = [{"page": n.index, "content": n.content,
**({"images": n.images} if n.images else {})}
for n in parsed.nodes if n.content]
# Strip text from structure to save storage space (PDF only;
# markdown needs text in structure for fallback retrieval)
doc_type = ext.lstrip(".")
if doc_type == "pdf":
clean_structure = remove_fields(result["structure"], fields=["text"])
else:
clean_structure = result["structure"]
self._storage.save_document(collection, doc_id, {
"doc_name": parsed.doc_name,
"doc_description": result.get("doc_description", ""),
"file_path": str(managed_path),
"file_hash": file_hash,
"doc_type": doc_type,
"structure": clean_structure,
"pages": pages,
})
except Exception as e:
managed_path.unlink(missing_ok=True)
doc_dir = col_dir / doc_id
if doc_dir.exists():
shutil.rmtree(doc_dir)
raise IndexingError(f"Failed to index {file_path}: {e}") from e
return doc_id
def get_document(self, collection: str, doc_id: str, include_text: bool = False) -> dict:
"""Get document metadata with structure.
Args:
include_text: If True, populate each structure node's 'text' field
from cached page content. WARNING: may be very large do NOT
use in agent/LLM contexts as it can exhaust the context window.
"""
doc = self._storage.get_document(collection, doc_id)
if not doc:
return {}
doc["structure"] = self._storage.get_document_structure(collection, doc_id)
if include_text:
pages = self._storage.get_pages(collection, doc_id) or []
page_map = {p["page"]: p["content"] for p in pages}
self._fill_node_text(doc["structure"], page_map)
return doc
@staticmethod
def _fill_node_text(nodes: list, page_map: dict) -> None:
"""Recursively fill 'text' on structure nodes from cached page content."""
for node in nodes:
start = node.get("start_index")
end = node.get("end_index")
if start is not None and end is not None:
node["text"] = "\n".join(
page_map.get(p, "") for p in range(start, end + 1)
)
if "nodes" in node:
LocalBackend._fill_node_text(node["nodes"], page_map)
def get_document_structure(self, collection: str, doc_id: str) -> list:
return self._storage.get_document_structure(collection, doc_id)
def get_page_content(self, collection: str, doc_id: str, pages: str) -> list:
doc = self._storage.get_document(collection, doc_id)
if not doc:
raise DocumentNotFoundError(f"Document {doc_id} not found")
page_nums = parse_pages(pages)
# Try cached pages first (fast, no file I/O)
cached_pages = self._storage.get_pages(collection, doc_id)
if cached_pages:
return [p for p in cached_pages if p["page"] in page_nums]
# Fallback to reading from file
if doc["doc_type"] == "pdf":
return get_pdf_page_content(doc["file_path"], page_nums)
else:
structure = self._storage.get_document_structure(collection, doc_id)
return get_md_page_content(structure, page_nums)
def list_documents(self, collection: str) -> list[dict]:
return self._storage.list_documents(collection)
def delete_document(self, collection: str, doc_id: str) -> None:
doc = self._storage.get_document(collection, doc_id)
if doc and doc.get("file_path"):
Path(doc["file_path"]).unlink(missing_ok=True)
# Clean up images directory: files/{collection}/{doc_id}/
doc_dir = self._files_dir / collection / doc_id
if doc_dir.exists():
shutil.rmtree(doc_dir)
self._storage.delete_document(collection, doc_id)
def get_agent_tools(self, collection: str, doc_ids: list[str] | None = None) -> AgentTools:
from agents import function_tool
import json
storage = self._storage
col_name = collection
backend = self
filter_ids = doc_ids
@function_tool
def list_documents() -> str:
"""List all documents in the collection."""
docs = storage.list_documents(col_name)
if filter_ids:
docs = [d for d in docs if d["doc_id"] in filter_ids]
return json.dumps(docs)
@function_tool
def get_document(doc_id: str) -> str:
"""Get document metadata."""
return json.dumps(storage.get_document(col_name, doc_id))
@function_tool
def get_document_structure(doc_id: str) -> str:
"""Get document tree structure (without text)."""
structure = storage.get_document_structure(col_name, doc_id)
return json.dumps(remove_fields(structure, fields=["text"]), ensure_ascii=False)
@function_tool
def get_page_content(doc_id: str, pages: str) -> str:
"""Get page content. Use tight ranges: '5-7', '3,8', '12'."""
result = backend.get_page_content(col_name, doc_id, pages)
return json.dumps(result, ensure_ascii=False)
return AgentTools(function_tools=[list_documents, get_document, get_document_structure, get_page_content])
def query(self, collection: str, question: str, doc_ids: list[str] | None = None) -> str:
from ..agent import AgentRunner
tools = self.get_agent_tools(collection, doc_ids)
return AgentRunner(tools=tools, model=self._retrieve_model).run(question)
async def query_stream(self, collection: str, question: str,
doc_ids: list[str] | None = None):
from ..agent import QueryStream
tools = self.get_agent_tools(collection, doc_ids)
stream = QueryStream(tools=tools, question=question, model=self._retrieve_model)
async for event in stream:
yield event

View file

@@ -0,0 +1,34 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Protocol, Any, AsyncIterator, runtime_checkable
from ..events import QueryEvent
@dataclass
class AgentTools:
    """Structured container for agent tool configuration (local mode only).

    CloudBackend returns an empty instance; LocalBackend populates
    ``function_tools`` with ``@function_tool``-wrapped callables.
    """
    # Callable tools handed to the retrieval agent (agents-SDK function tools).
    function_tools: list[Any] = field(default_factory=list)
    # Extra server handles to attach to the agent; presumably MCP servers —
    # not populated anywhere in this file, TODO confirm against the agent code.
    mcp_servers: list[Any] = field(default_factory=list)
@runtime_checkable
class Backend(Protocol):
    """Structural interface implemented by both LocalBackend and CloudBackend.

    ``@runtime_checkable`` enables ``isinstance(obj, Backend)`` checks; note
    such checks verify method presence only, not signatures.
    """

    # Collection management
    def create_collection(self, name: str) -> None: ...
    def get_or_create_collection(self, name: str) -> None: ...
    def list_collections(self) -> list[str]: ...
    def delete_collection(self, name: str) -> None: ...
    # Document management
    # add_document returns the backend-assigned document id.
    def add_document(self, collection: str, file_path: str) -> str: ...
    # include_text=True may inline large page text — avoid in LLM contexts.
    def get_document(self, collection: str, doc_id: str, include_text: bool = False) -> dict: ...
    def get_document_structure(self, collection: str, doc_id: str) -> list: ...
    # ``pages`` is a spec string such as '5-7', '3,8', or '12'.
    def get_page_content(self, collection: str, doc_id: str, pages: str) -> list: ...
    def list_documents(self, collection: str) -> list[dict]: ...
    def delete_document(self, collection: str, doc_id: str) -> None: ...
    # Query
    # doc_ids, when given, restricts the query to those documents.
    def query(self, collection: str, question: str, doc_ids: list[str] | None = None) -> str: ...
    async def query_stream(self, collection: str, question: str,
                           doc_ids: list[str] | None = None) -> AsyncIterator[QueryEvent]: ...