feat: add PageIndex SDK with local/cloud dual-mode support (#207)

2026-06-21 20:18:09 +02:00 · 2026-04-06 22:51:04 +08:00 · 2026-04-06 22:51:04 +08:00 · c7fe93bb56
commit c7fe93bb56
parent f2dcffc0b7
45 changed files with 4225 additions and 274 deletions
--- a/examples/cloud_demo.py
+++ b/examples/cloud_demo.py
@ -0,0 +1,62 @@
+"""
+Agentic Vectorless RAG with PageIndex SDK - Cloud Demo
+
+Uses CloudClient for fully-managed document indexing and QA.
+No LLM API key needed — the cloud service handles everything.
+
+Steps:
+  1 — Upload and index a PDF via PageIndex cloud
+  2 — Stream a question with tool call visibility
+
+Requirements:
+    pip install pageindex requests
+    export PAGEINDEX_API_KEY=your-api-key
+"""
+import asyncio
+import os
+from pathlib import Path
+import requests
+from pageindex import CloudClient
+
+# Locations used by the demo: the sample paper's URL and where it is cached
+# on disk (a "documents" folder next to this script).
+_EXAMPLES_DIR = Path(__file__).parent
+PDF_URL = "https://arxiv.org/pdf/1706.03762.pdf"
+PDF_PATH = _EXAMPLES_DIR / "documents" / "attention.pdf"
+
+# Download the PDF only when no cached copy exists yet.
+if not PDF_PATH.exists():
+    print(f"Downloading {PDF_URL} ...")
+    PDF_PATH.parent.mkdir(parents=True, exist_ok=True)
+    # Stream into a sibling ".part" file and rename it into place only after
+    # the whole body has been written. Writing straight to PDF_PATH would
+    # leave a truncated file behind if the transfer failed midway, and the
+    # exists() check above would then reuse that partial file on later runs.
+    _partial_path = PDF_PATH.with_name(PDF_PATH.name + ".part")
+    try:
+        with requests.get(PDF_URL, stream=True, timeout=30) as r:
+            r.raise_for_status()
+            with open(_partial_path, "wb") as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    if chunk:  # skip empty chunks
+                        f.write(chunk)
+        _partial_path.replace(PDF_PATH)
+    finally:
+        # After a successful rename the ".part" file no longer exists, so
+        # this only removes leftovers from a failed or interrupted download.
+        if _partial_path.exists():
+            _partial_path.unlink()
+    print("Download complete.\n")
+
+# Fail early with an actionable message rather than a bare KeyError traceback
+# when the API key is missing (or set to an empty string).
+_api_key = os.environ.get("PAGEINDEX_API_KEY")
+if not _api_key:
+    raise SystemExit(
+        "PAGEINDEX_API_KEY is not set. "
+        "Run `export PAGEINDEX_API_KEY=your-api-key` before starting this demo."
+    )
+
+client = CloudClient(api_key=_api_key)
+col = client.collection()
+
+# Step 1: add the PDF to the collection; the returned value is printed as the
+# document id.
+doc_id = col.add(str(PDF_PATH))
+print(f"Indexed: {doc_id}\n")
+
+# Step 2: streaming query. The returned object is consumed with `async for`
+# inside main() below.
+stream = col.query("What is the main contribution of this paper?", stream=True)
+
+async def main():
+    """Print streamed query events as they arrive.
+
+    Answer text is written inline without a newline; each tool call is
+    printed on a line of its own; the end of the answer closes the line.
+    """
+    # True while answer text has been printed and the line is still open.
+    line_open = False
+    async for event in stream:
+        kind = event.type
+
+        if kind == "answer_done":
+            print()
+            line_open = False
+            continue
+
+        if kind == "tool_call":
+            # Close any partially printed answer line so the tool call
+            # starts on a fresh line.
+            if line_open:
+                print()
+                line_open = False
+            call = event.data
+            args = call.get("args", "")
+            print(f"[tool call] {call['name']}({args})")
+            continue
+
+        if kind == "answer_delta":
+            print(event.data, end="", flush=True)
+            line_open = True
+
+asyncio.run(main())