PageIndex/examples/cloud_demo.py

62 lines
1.8 KiB
Python

"""
Agentic Vectorless RAG with PageIndex SDK - Cloud Demo
Uses CloudClient for fully-managed document indexing and QA.
No LLM API key needed — the cloud service handles everything.
Steps:
    1 — Upload and index a PDF via the PageIndex cloud
    2 — Ask a question and stream the answer, showing tool calls as they occur
Requirements:
    pip install pageindex requests
    export PAGEINDEX_API_KEY=your-api-key
"""
import asyncio
import os
from pathlib import Path
import requests
from pageindex import CloudClient
_EXAMPLES_DIR = Path(__file__).parent
PDF_URL = "https://arxiv.org/pdf/1706.03762.pdf"
PDF_PATH = _EXAMPLES_DIR / "documents" / "attention.pdf"

# Download the PDF once; later runs reuse the local copy at PDF_PATH.
if not PDF_PATH.exists():
    print(f"Downloading {PDF_URL} ...")
    PDF_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Stream into a sibling ".part" file and move it into place only after the
    # whole body has been written. Writing straight to PDF_PATH would leave a
    # truncated file behind if the transfer failed midway, and the exists()
    # check above would then skip the download on every later run.
    _partial_path = PDF_PATH.with_name(PDF_PATH.name + ".part")
    try:
        with requests.get(PDF_URL, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(_partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # skip empty chunks
                        f.write(chunk)
        os.replace(_partial_path, PDF_PATH)
    finally:
        # After a successful replace the partial file no longer exists, so
        # this only cleans up when the download or the write failed.
        if _partial_path.exists():
            _partial_path.unlink()
    print("Download complete.\n")
# Build the cloud client; the API key is read from the environment and a
# missing PAGEINDEX_API_KEY raises KeyError here.
_api_key = os.environ["PAGEINDEX_API_KEY"]
client = CloudClient(api_key=_api_key)

# Obtain a collection from the client, add the local PDF to it, and report
# the value that add() returns.
col = client.collection()
_pdf_location = str(PDF_PATH)
doc_id = col.add(_pdf_location)
print(f"Indexed: {doc_id}\n")

# Streaming query: main() iterates over this object with ``async for``.
_question = "What is the main contribution of this paper?"
stream = col.query(_question, stream=True)
async def main():
    """Print the events of the module-level ``stream`` as they arrive.

    Handles three event types:

    * ``answer_delta`` -- ``event.data`` is printed without a newline and
      flushed immediately, so the answer appears incrementally.
    * ``tool_call`` -- printed on its own line as ``[tool call] name(args)``;
      a pending answer line is terminated first.
    * ``answer_done`` -- terminates the current line.

    Events of any other type are ignored.
    """
    # True while answer text has been written without a trailing newline.
    line_open = False

    async for event in stream:
        kind = event.type

        if kind == "answer_delta":
            print(event.data, end="", flush=True)
            line_open = True
            continue

        if kind == "tool_call":
            # Finish a partially printed answer line so the tool call does
            # not get appended to it.
            if line_open:
                print()
                line_open = False
            call = event.data
            args = call.get("args", "")
            print(f"[tool call] {call['name']}({args})")
            continue

        if kind == "answer_done":
            print()
            line_open = False


asyncio.run(main())