mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-24 23:56:21 +02:00
Merge pull request #197 from VectifyAI/polish/demo-docstring-and-pathlib
Polish demo docstring and migrate to pathlib
This commit is contained in:
commit
54542f03e6
1 changed file with 26 additions and 23 deletions
|
|
@@ -1,28 +1,31 @@
|
||||||
"""
|
"""
|
||||||
PageIndex x OpenAI Agents Demo
|
Agentic Vectorless RAG with PageIndex - Demo
|
||||||
|
|
||||||
Demonstrates how to use PageIndexClient with the OpenAI Agents SDK
|
A simple example of building a document QA agent with self-hosted PageIndex
|
||||||
to build a document QA agent with 3 tools:
|
and the OpenAI Agents SDK. Instead of vector similarity search and chunking,
|
||||||
- get_document()
|
PageIndex builds a hierarchical tree index and uses agentic LLM reasoning for
|
||||||
- get_document_structure()
|
human-like, context-aware retrieval.
|
||||||
- get_page_content()
|
|
||||||
|
|
||||||
Requirements:
|
Agent tools:
|
||||||
pip install openai-agents
|
- get_document() — document metadata (status, page count, etc.)
|
||||||
|
- get_document_structure() — tree structure index of a document
|
||||||
|
- get_page_content() — retrieve text content of specific pages
|
||||||
|
|
||||||
Steps:
|
Steps:
|
||||||
1 — Index PDF and inspect tree structure
|
1 — Index a PDF and view its tree structure index
|
||||||
2 — Inspect document metadata
|
2 — View document metadata
|
||||||
3 — Ask a question (agent auto-calls tools)
|
3 — Ask a question (agent reasons over the index and auto-calls tools)
|
||||||
|
|
||||||
|
Requirements: pip install openai-agents
|
||||||
"""
|
"""
|
||||||
import os
|
|
||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
|
from pathlib import Path
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
from agents import Agent, Runner, function_tool, set_tracing_disabled
|
from agents import Agent, Runner, function_tool, set_tracing_disabled
|
||||||
from agents.model_settings import ModelSettings
|
from agents.model_settings import ModelSettings
|
||||||
|
|
@@ -34,9 +37,9 @@ import pageindex.utils as utils
|
||||||
|
|
||||||
PDF_URL = "https://arxiv.org/pdf/2603.15031"
|
PDF_URL = "https://arxiv.org/pdf/2603.15031"
|
||||||
|
|
||||||
_EXAMPLES_DIR = os.path.dirname(os.path.abspath(__file__))
|
_EXAMPLES_DIR = Path(__file__).parent
|
||||||
PDF_PATH = os.path.join(_EXAMPLES_DIR, "documents", "attention-residuals.pdf")
|
PDF_PATH = _EXAMPLES_DIR / "documents" / "attention-residuals.pdf"
|
||||||
WORKSPACE = os.path.join(_EXAMPLES_DIR, "workspace")
|
WORKSPACE = _EXAMPLES_DIR / "workspace"
|
||||||
|
|
||||||
AGENT_SYSTEM_PROMPT = """
|
AGENT_SYSTEM_PROMPT = """
|
||||||
You are PageIndex, a document QA assistant.
|
You are PageIndex, a document QA assistant.
|
||||||
|
|
@@ -138,9 +141,9 @@ if __name__ == "__main__":
|
||||||
set_tracing_disabled(True)
|
set_tracing_disabled(True)
|
||||||
|
|
||||||
# Download PDF if needed
|
# Download PDF if needed
|
||||||
if not os.path.exists(PDF_PATH):
|
if not PDF_PATH.exists():
|
||||||
print(f"Downloading {PDF_URL} ...")
|
print(f"Downloading {PDF_URL} ...")
|
||||||
os.makedirs(os.path.dirname(PDF_PATH), exist_ok=True)
|
PDF_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with requests.get(PDF_URL, stream=True, timeout=30) as r:
|
with requests.get(PDF_URL, stream=True, timeout=30) as r:
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
with open(PDF_PATH, "wb") as f:
|
with open(PDF_PATH, "wb") as f:
|
||||||
|
|
@@ -152,12 +155,12 @@ if __name__ == "__main__":
|
||||||
# Setup
|
# Setup
|
||||||
client = PageIndexClient(workspace=WORKSPACE)
|
client = PageIndexClient(workspace=WORKSPACE)
|
||||||
|
|
||||||
# Step 1: Index + Tree
|
# Step 1: Index PDF and view tree structure
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
print("Step 1: Indexing PDF and inspecting tree structure")
|
print("Step 1: Index PDF and view tree structure")
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
doc_id = next(
|
doc_id = next(
|
||||||
(did for did, doc in client.documents.items() if doc.get('doc_name') == os.path.basename(PDF_PATH)),
|
(did for did, doc in client.documents.items() if doc.get('doc_name') == PDF_PATH.name),
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
if doc_id:
|
if doc_id:
|
||||||
|
|
@@ -169,9 +172,9 @@ if __name__ == "__main__":
|
||||||
structure = json.loads(client.get_document_structure(doc_id))
|
structure = json.loads(client.get_document_structure(doc_id))
|
||||||
utils.print_tree(structure)
|
utils.print_tree(structure)
|
||||||
|
|
||||||
# Step 2: Document Metadata
|
# Step 2: View document metadata
|
||||||
print("\n" + "=" * 60)
|
print("\n" + "=" * 60)
|
||||||
print("Step 2: Document Metadata (get_document)")
|
print("Step 2: View document metadata")
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
doc_metadata = client.get_document(doc_id)
|
doc_metadata = client.get_document(doc_id)
|
||||||
print(f"\n{doc_metadata}")
|
print(f"\n{doc_metadata}")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue