Merge pull request #197 from VectifyAI/polish/demo-docstring-and-pathlib

Polish demo docstring and migrate to pathlib
This commit is contained in:
Ray 2026-03-29 05:03:50 +08:00 committed by GitHub
commit 54542f03e6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -1,28 +1,31 @@
""" """
PageIndex x OpenAI Agents Demo Agentic Vectorless RAG with PageIndex - Demo
Demonstrates how to use PageIndexClient with the OpenAI Agents SDK A simple example of building a document QA agent with self-hosted PageIndex
to build a document QA agent with 3 tools: and the OpenAI Agents SDK. Instead of vector similarity search and chunking,
- get_document() PageIndex builds a hierarchical tree index and uses agentic LLM reasoning for
- get_document_structure() human-like, context-aware retrieval.
- get_page_content()
Requirements: Agent tools:
pip install openai-agents - get_document() document metadata (status, page count, etc.)
- get_document_structure() tree structure index of a document
- get_page_content() retrieve text content of specific pages
Steps: Steps:
1 Index PDF and inspect tree structure 1 Index a PDF and view its tree structure index
2 Inspect document metadata 2 View document metadata
3 Ask a question (agent auto-calls tools) 3 Ask a question (agent reasons over the index and auto-calls tools)
Requirements: pip install openai-agents
""" """
import os
import sys import sys
import json import json
import asyncio import asyncio
import concurrent.futures import concurrent.futures
from pathlib import Path
import requests import requests
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, str(Path(__file__).parent.parent))
from agents import Agent, Runner, function_tool, set_tracing_disabled from agents import Agent, Runner, function_tool, set_tracing_disabled
from agents.model_settings import ModelSettings from agents.model_settings import ModelSettings
@@ -34,9 +37,9 @@ import pageindex.utils as utils
PDF_URL = "https://arxiv.org/pdf/2603.15031" PDF_URL = "https://arxiv.org/pdf/2603.15031"
_EXAMPLES_DIR = os.path.dirname(os.path.abspath(__file__)) _EXAMPLES_DIR = Path(__file__).parent
PDF_PATH = os.path.join(_EXAMPLES_DIR, "documents", "attention-residuals.pdf") PDF_PATH = _EXAMPLES_DIR / "documents" / "attention-residuals.pdf"
WORKSPACE = os.path.join(_EXAMPLES_DIR, "workspace") WORKSPACE = _EXAMPLES_DIR / "workspace"
AGENT_SYSTEM_PROMPT = """ AGENT_SYSTEM_PROMPT = """
You are PageIndex, a document QA assistant. You are PageIndex, a document QA assistant.
@@ -138,9 +141,9 @@ if __name__ == "__main__":
set_tracing_disabled(True) set_tracing_disabled(True)
# Download PDF if needed # Download PDF if needed
if not os.path.exists(PDF_PATH): if not PDF_PATH.exists():
print(f"Downloading {PDF_URL} ...") print(f"Downloading {PDF_URL} ...")
os.makedirs(os.path.dirname(PDF_PATH), exist_ok=True) PDF_PATH.parent.mkdir(parents=True, exist_ok=True)
with requests.get(PDF_URL, stream=True, timeout=30) as r: with requests.get(PDF_URL, stream=True, timeout=30) as r:
r.raise_for_status() r.raise_for_status()
with open(PDF_PATH, "wb") as f: with open(PDF_PATH, "wb") as f:
@@ -152,12 +155,12 @@ if __name__ == "__main__":
# Setup # Setup
client = PageIndexClient(workspace=WORKSPACE) client = PageIndexClient(workspace=WORKSPACE)
# Step 1: Index + Tree # Step 1: Index PDF and view tree structure
print("=" * 60) print("=" * 60)
print("Step 1: Indexing PDF and inspecting tree structure") print("Step 1: Index PDF and view tree structure")
print("=" * 60) print("=" * 60)
doc_id = next( doc_id = next(
(did for did, doc in client.documents.items() if doc.get('doc_name') == os.path.basename(PDF_PATH)), (did for did, doc in client.documents.items() if doc.get('doc_name') == PDF_PATH.name),
None, None,
) )
if doc_id: if doc_id:
@@ -169,9 +172,9 @@ if __name__ == "__main__":
structure = json.loads(client.get_document_structure(doc_id)) structure = json.loads(client.get_document_structure(doc_id))
utils.print_tree(structure) utils.print_tree(structure)
# Step 2: Document Metadata # Step 2: View document metadata
print("\n" + "=" * 60) print("\n" + "=" * 60)
print("Step 2: Document Metadata (get_document)") print("Step 2: View document metadata")
print("=" * 60) print("=" * 60)
doc_metadata = client.get_document(doc_id) doc_metadata = client.get_document(doc_id)
print(f"\n{doc_metadata}") print(f"\n{doc_metadata}")