From ce9cbc2ed05432e8a16bec42304dc26c2745e997 Mon Sep 17 00:00:00 2001 From: Ray Date: Sun, 29 Mar 2026 04:56:27 +0800 Subject: [PATCH] Polish demo docstring and migrate to pathlib --- examples/agentic_vectorless_rag_demo.py | 49 +++++++++++++------------ 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/examples/agentic_vectorless_rag_demo.py b/examples/agentic_vectorless_rag_demo.py index c8d0674..1c5b9d4 100644 --- a/examples/agentic_vectorless_rag_demo.py +++ b/examples/agentic_vectorless_rag_demo.py @@ -1,28 +1,31 @@ """ -PageIndex x OpenAI Agents Demo +Agentic Vectorless RAG with PageIndex - Demo -Demonstrates how to use PageIndexClient with the OpenAI Agents SDK -to build a document QA agent with 3 tools: - - get_document() - - get_document_structure() - - get_page_content() +A simple example of building a document QA agent with self-hosted PageIndex +and the OpenAI Agents SDK. Instead of vector similarity search and chunking, +PageIndex builds a hierarchical tree index and uses agentic LLM reasoning for +human-like, context-aware retrieval. -Requirements: - pip install openai-agents +Agent tools: + - get_document() — document metadata (status, page count, etc.) + - get_document_structure() — tree structure index of a document + - get_page_content() — retrieve text content of specific pages Steps: - 1 — Index PDF and inspect tree structure - 2 — Inspect document metadata - 3 — Ask a question (agent auto-calls tools) + 1 — Index a PDF and view its tree structure index + 2 — View document metadata + 3 — Ask a question (agent reasons over the index and auto-calls tools) + +Requirements: pip install openai-agents """ -import os import sys import json import asyncio import concurrent.futures +from pathlib import Path import requests -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.insert(0, str(Path(__file__).parent.parent)) from agents import Agent, Runner, function_tool, set_tracing_disabled from agents.model_settings import ModelSettings @@ -34,9 +37,9 @@ import pageindex.utils as utils PDF_URL = "https://arxiv.org/pdf/2603.15031" -_EXAMPLES_DIR = os.path.dirname(os.path.abspath(__file__)) -PDF_PATH = os.path.join(_EXAMPLES_DIR, "documents", "attention-residuals.pdf") -WORKSPACE = os.path.join(_EXAMPLES_DIR, "workspace") +_EXAMPLES_DIR = Path(__file__).parent +PDF_PATH = _EXAMPLES_DIR / "documents" / "attention-residuals.pdf" +WORKSPACE = _EXAMPLES_DIR / "workspace" AGENT_SYSTEM_PROMPT = """ You are PageIndex, a document QA assistant. @@ -138,9 +141,9 @@ if __name__ == "__main__": set_tracing_disabled(True) # Download PDF if needed - if not os.path.exists(PDF_PATH): + if not PDF_PATH.exists(): print(f"Downloading {PDF_URL} ...") - os.makedirs(os.path.dirname(PDF_PATH), exist_ok=True) + PDF_PATH.parent.mkdir(parents=True, exist_ok=True) with requests.get(PDF_URL, stream=True, timeout=30) as r: r.raise_for_status() with open(PDF_PATH, "wb") as f: @@ -152,12 +155,12 @@ if __name__ == "__main__": # Setup client = PageIndexClient(workspace=WORKSPACE) - # Step 1: Index + Tree + # Step 1: Index PDF and view tree structure print("=" * 60) - print("Step 1: Indexing PDF and inspecting tree structure") + print("Step 1: Index PDF and view tree structure") print("=" * 60) doc_id = next( - (did for did, doc in client.documents.items() if doc.get('doc_name') == os.path.basename(PDF_PATH)), + (did for did, doc in client.documents.items() if doc.get('doc_name') == PDF_PATH.name), None, ) if doc_id: @@ -169,9 +172,9 @@ if __name__ == "__main__": structure = json.loads(client.get_document_structure(doc_id)) utils.print_tree(structure) - # Step 2: Document Metadata + # Step 2: View document metadata print("\n" + "=" * 60) - print("Step 2: Document Metadata (get_document)") + print("Step 2: View document metadata") print("=" * 60) doc_metadata = client.get_document(doc_id) print(f"\n{doc_metadata}")