# PageIndex/examples/openai_agents_demo.py

"""
PageIndex x OpenAI Agents Demo

Demonstrates how to use PageIndexClient with the OpenAI Agents SDK
to build a document QA agent with 3 tools:
  - get_document()
  - get_document_structure()
  - get_page_content()

Requirements:
    pip install openai-agents

Steps:
  1 — Index PDF and inspect tree structure
  2 — Inspect document metadata
  3 — Ask a question (agent auto-calls tools)
  4 — Re-run the script to reload from the workspace and verify persistence (the cached doc_id is reused)
"""
import os
import sys
import asyncio
import concurrent.futures
import requests
from pathlib import Path

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from agents import Agent, ItemHelpers, Runner, function_tool
from agents.stream_events import RawResponsesStreamEvent, RunItemStreamEvent
from openai.types.responses import ResponseTextDeltaEvent, ResponseReasoningSummaryTextDeltaEvent  # noqa: F401

from pageindex import PageIndexClient
import pageindex.utils as utils

# Source PDF; it is fetched only when PDF_PATH does not exist yet.
PDF_URL = "https://arxiv.org/pdf/2501.12948.pdf"
# Local path the PDF is downloaded to and indexed from.
PDF_PATH = "tests/pdfs/deepseek-r1.pdf"
# Workspace directory handed to PageIndexClient; the demo also keeps its
# cached doc_id file ("demo_doc_id.txt") inside it.
WORKSPACE = "./pageindex_workspace"

# Instructions passed to the Agent: the intended order of tool calls and the
# rules for answering. This text is sent to the model verbatim.
AGENT_SYSTEM_PROMPT = """
You are PageIndex, a document QA assistant.
TOOL USE:
- Call get_document() first to confirm status and page/line count.
- Call get_document_structure() to find relevant page ranges (use node summaries and start_index/end_index).
- Call get_page_content(pages="5-7") with tight ranges. Never fetch the whole doc.
- When calling tool call, output one short sentence explaining reason.
ANSWERING: Answer based only on tool output. Be concise.
"""


def query_agent(
    client: PageIndexClient,
    doc_id: str,
    prompt: str,
    verbose: bool = False,
) -> str:
    """Run a document QA agent using the OpenAI Agents SDK.

    Streams text output token-by-token while the agent runs and returns the
    text of the agent's last message (its answer).
    Tool calls are always printed; verbose=True also prints arguments and output previews.

    Args:
        client: PageIndexClient whose get_document / get_document_structure /
            get_page_content methods back the three agent tools, and whose
            ``retrieve_model`` is used as the agent's model.
        doc_id: Identifier of the document every tool call is bound to.
        prompt: The user question handed to the agent.
        verbose: When True, also print tool-call arguments and a preview
            (first 200 characters) of each tool output.

    Returns:
        The text of the last message the agent produced, or an empty string
        if the run produced no message text.
    """

    # NOTE: the tool docstrings below are sent to the model as tool
    # descriptions, so they are part of the runtime behavior.
    @function_tool
    def get_document() -> str:
        """Get document metadata: status, page count, name, and description."""
        return client.get_document(doc_id)

    @function_tool
    def get_document_structure() -> str:
        """Get the document's full tree structure (without text) to find relevant sections."""
        return client.get_document_structure(doc_id)

    @function_tool
    def get_page_content(pages: str) -> str:
        """
        Get the text content of specific pages or line numbers.
        Use tight ranges: e.g. '5-7' for pages 5 to 7, '3,8' for pages 3 and 8, '12' for page 12.
        For Markdown documents, use line numbers from the structure's line_num field.
        """
        return client.get_page_content(doc_id, pages)

    agent = Agent(
        name="PageIndex",
        instructions=AGENT_SYSTEM_PROMPT,
        tools=[get_document, get_document_structure, get_page_content],
        model=client.retrieve_model,
    )

    async def _run() -> str:
        """Consume the event stream, echo it to stdout and return the last message text."""
        collected = []  # deltas of the message currently being streamed
        last_message = ""  # full text of the most recently completed message
        streamed_this_turn = False
        streamed_run = Runner.run_streamed(agent, prompt)
        async for event in streamed_run.stream_events():
            if isinstance(event, RawResponsesStreamEvent):
                if isinstance(event.data, ResponseReasoningSummaryTextDeltaEvent):
                    # Reasoning summaries are shown but are not part of the answer.
                    print(event.data.delta, end="", flush=True)
                elif isinstance(event.data, ResponseTextDeltaEvent):
                    delta = event.data.delta
                    print(delta, end="", flush=True)
                    collected.append(delta)
                    streamed_this_turn = True
            elif isinstance(event, RunItemStreamEvent):
                item = event.item
                if item.type == "message_output_item":
                    if streamed_this_turn:
                        # The message was already echoed delta by delta; keep
                        # its text before the per-message buffer is reset.
                        last_message = "".join(collected)
                    else:
                        # Nothing was streamed for this message: print it whole
                        # and remember it, so it can still be returned.
                        text = ItemHelpers.text_message_output(item)
                        if text:
                            print(f"{text}")
                            last_message = text
                    streamed_this_turn = False
                    collected.clear()
                elif item.type == "tool_call_item":
                    if streamed_this_turn:
                        print()  # end streaming line before tool call
                    raw = item.raw_item
                    args = getattr(raw, "arguments", "{}")
                    args_str = f"({args})" if verbose else ""
                    print(f"[tool call]: {raw.name}{args_str}")
                elif item.type == "tool_call_output_item" and verbose:
                    output = str(item.output)
                    preview = output[:200] + "..." if len(output) > 200 else output
                    print(f"[tool output]: {preview}\n")
        # Deltas still buffered here belong to a message that was streamed but
        # never closed by a message item; they are the newest text, so prefer them.
        return "".join(collected) or last_message

    # Only the loop probe lives inside the try: a RuntimeError raised by the
    # agent run itself must propagate instead of being mistaken for
    # "no running loop" and triggering a second run.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread, so asyncio.run() is safe.
        return asyncio.run(_run())

    # A loop is already running here (e.g. inside a notebook) and asyncio.run()
    # refuses to nest, so drive the coroutine on a worker thread with its own loop.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, _run()).result()


# ── Download PDF if needed ─────────────────────────────────────────────────────
if not os.path.exists(PDF_PATH):
    print(f"Downloading {PDF_URL} ...")
    # os.path.dirname() is "" for a bare filename, which os.makedirs() rejects.
    _pdf_dir = os.path.dirname(PDF_PATH)
    if _pdf_dir:
        os.makedirs(_pdf_dir, exist_ok=True)
    # Stream into a sibling ".part" file and rename it only after the download
    # succeeded. Writing straight to PDF_PATH would leave a truncated file on
    # failure, and the existence check above would then skip the download on
    # every later run.
    _partial_path = PDF_PATH + ".part"
    try:
        with requests.get(PDF_URL, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(_partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(_partial_path, PDF_PATH)
    finally:
        # After a successful rename the partial file is gone; this only
        # cleans up after a failed or interrupted download.
        if os.path.exists(_partial_path):
            os.remove(_partial_path)
    print("Download complete.\n")

# ── Setup ──────────────────────────────────────────────────────────────────────
# A single client, pointed at WORKSPACE, is shared by every step below.
client = PageIndexClient(workspace=WORKSPACE)

# ── Step 1: Index + Tree ───────────────────────────────────────────────────────
print("=" * 60, "Step 1: Indexing PDF and inspecting tree structure", "=" * 60, sep="\n")

# The doc_id of a previous run is remembered in a small text file in the workspace.
_id_cache = Path(WORKSPACE).expanduser() / "demo_doc_id.txt"
doc_id = _id_cache.read_text().strip() if _id_cache.exists() else None
if doc_id is None or doc_id not in client.documents:
    # No cached id, or the client does not know it: index the PDF and
    # remember the new id for the next run.
    doc_id = client.index(PDF_PATH)
    _id_cache.parent.mkdir(parents=True, exist_ok=True)
    _id_cache.write_text(doc_id)
    print(f"\nIndexed. doc_id: {doc_id}")
else:
    print(f"\nLoaded cached doc_id: {doc_id}")

print("\nTree Structure (top-level sections):")
utils.print_tree(client.documents[doc_id]["structure"])

# ── Step 2: Document Metadata ──────────────────────────────────────────────────
# Show the same metadata string the agent receives from its get_document tool.
print("\n" + "=" * 60, "Step 2: Document Metadata (get_document)", "=" * 60, sep="\n")
print(client.get_document(doc_id))

# ── Step 3: Agent Query ────────────────────────────────────────────────────────
print("\n" + "=" * 60, "Step 3: Agent Query (auto tool-use)", "=" * 60, sep="\n")
question = (
    "What reward design does DeepSeek-R1-Zero use, "
    "and why was it chosen over supervised fine-tuning?"
)
print(f"\nQuestion: '{question}'\n")
# The run echoes its own progress to stdout; the returned text is not used here.
query_agent(client, doc_id, question, verbose=True)
Add PageIndexClient with agent-based retrieval via OpenAI Agents SDK (#125) * Add PageIndexClient with retrieve, streaming support and litellm integration * Add OpenAI agents demo example * Update README with example agent demo section * Support separate retrieve_model configuration for index and retrieve 2026-03-26 23:19:50 +08:00			`"""`
			`PageIndex x OpenAI Agents Demo`

			`Demonstrates how to use PageIndexClient with the OpenAI Agents SDK`
			`to build a document QA agent with 3 tools:`
			`- get_document()`
			`- get_document_structure()`
			`- get_page_content()`

			`Requirements:`
			`pip install openai-agents`

			`Steps:`
			`1 — Index PDF and inspect tree structure`
			`2 — Inspect document metadata`
			`3 — Ask a question (agent auto-calls tools)`
			`4 — Reload from workspace and verify persistence`
			`"""`
			`import os`
			`import sys`
			`import asyncio`
			`import concurrent.futures`
			`import requests`
			`from pathlib import Path`

			`sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))`

			`from agents import Agent, ItemHelpers, Runner, function_tool`
			`from agents.stream_events import RawResponsesStreamEvent, RunItemStreamEvent`
			`from openai.types.responses import ResponseTextDeltaEvent, ResponseReasoningSummaryTextDeltaEvent # noqa: F401`

			`from pageindex import PageIndexClient`
			`import pageindex.utils as utils`

			`PDF_URL = "https://arxiv.org/pdf/2501.12948.pdf"`
			`PDF_PATH = "tests/pdfs/deepseek-r1.pdf"`
			`WORKSPACE = "./pageindex_workspace"`

			`AGENT_SYSTEM_PROMPT = """`
			`You are PageIndex, a document QA assistant.`
			`TOOL USE:`
			`- Call get_document() first to confirm status and page/line count.`
			`- Call get_document_structure() to find relevant page ranges (use node summaries and start_index/end_index).`
			`- Call get_page_content(pages="5-7") with tight ranges. Never fetch the whole doc.`
			`- When calling tool call, output one short sentence explaining reason.`
			`ANSWERING: Answer based only on tool output. Be concise.`
			`"""`


			`def query_agent(`
			`client: PageIndexClient,`
			`doc_id: str,`
			`prompt: str,`
			`verbose: bool = False,`
			`) -> str:`
			`"""Run a document QA agent using the OpenAI Agents SDK.`

			`Streams text output token-by-token and returns the full answer string.`
			`Tool calls are always printed; verbose=True also prints arguments and output previews.`
			`"""`

			`@function_tool`
			`def get_document() -> str:`
			`"""Get document metadata: status, page count, name, and description."""`
			`return client.get_document(doc_id)`

			`@function_tool`
			`def get_document_structure() -> str:`
			`"""Get the document's full tree structure (without text) to find relevant sections."""`
			`return client.get_document_structure(doc_id)`

			`@function_tool`
			`def get_page_content(pages: str) -> str:`
			`"""`
			`Get the text content of specific pages or line numbers.`
			`Use tight ranges: e.g. '5-7' for pages 5 to 7, '3,8' for pages 3 and 8, '12' for page 12.`
			`For Markdown documents, use line numbers from the structure's line_num field.`
			`"""`
			`return client.get_page_content(doc_id, pages)`

			`agent = Agent(`
			`name="PageIndex",`
			`instructions=AGENT_SYSTEM_PROMPT,`
			`tools=[get_document, get_document_structure, get_page_content],`
			`model=client.retrieve_model,`
			`)`

			`async def _run():`
			`collected = []`
			`streamed_this_turn = False`
			`streamed_run = Runner.run_streamed(agent, prompt)`
			`async for event in streamed_run.stream_events():`
			`if isinstance(event, RawResponsesStreamEvent):`
			`if isinstance(event.data, ResponseReasoningSummaryTextDeltaEvent):`
			`print(event.data.delta, end="", flush=True)`
			`elif isinstance(event.data, ResponseTextDeltaEvent):`
			`delta = event.data.delta`
			`print(delta, end="", flush=True)`
			`collected.append(delta)`
			`streamed_this_turn = True`
			`elif isinstance(event, RunItemStreamEvent):`
			`item = event.item`
			`if item.type == "message_output_item":`
			`if not streamed_this_turn:`
			`text = ItemHelpers.text_message_output(item)`
			`if text:`
			`print(f"{text}")`
			`streamed_this_turn = False`
			`collected.clear()`
			`elif item.type == "tool_call_item":`
			`if streamed_this_turn:`
			`print() # end streaming line before tool call`
			`raw = item.raw_item`
			`args = getattr(raw, "arguments", "{}")`
			`args_str = f"({args})" if verbose else ""`
			`print(f"[tool call]: {raw.name}{args_str}")`
			`elif item.type == "tool_call_output_item" and verbose:`
			`output = str(item.output)`
			`preview = output[:200] + "..." if len(output) > 200 else output`
			`print(f"[tool output]: {preview}\n")`
			`return "".join(collected)`

			`try:`
			`asyncio.get_running_loop()`
			`with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:`
			`return pool.submit(asyncio.run, _run()).result()`
			`except RuntimeError:`
			`return asyncio.run(_run())`


			`# ── Download PDF if needed ─────────────────────────────────────────────────────`
			`if not os.path.exists(PDF_PATH):`
			`print(f"Downloading {PDF_URL} ...")`
			`os.makedirs(os.path.dirname(PDF_PATH), exist_ok=True)`
			`with requests.get(PDF_URL, stream=True, timeout=30) as r:`
			`r.raise_for_status()`
			`with open(PDF_PATH, "wb") as f:`
			`for chunk in r.iter_content(chunk_size=8192):`
			`if chunk:`
			`f.write(chunk)`
			`print("Download complete.\n")`

			`# ── Setup ──────────────────────────────────────────────────────────────────────`
			`client = PageIndexClient(workspace=WORKSPACE)`

			`# ── Step 1: Index + Tree ───────────────────────────────────────────────────────`
			`print("=" * 60)`
			`print("Step 1: Indexing PDF and inspecting tree structure")`
			`print("=" * 60)`
			`_id_cache = Path(WORKSPACE).expanduser() / "demo_doc_id.txt"`
			`if _id_cache.exists() and (doc_id := _id_cache.read_text().strip()) in client.documents:`
			`print(f"\nLoaded cached doc_id: {doc_id}")`
			`else:`
			`doc_id = client.index(PDF_PATH)`
			`_id_cache.parent.mkdir(parents=True, exist_ok=True)`
			`_id_cache.write_text(doc_id)`
			`print(f"\nIndexed. doc_id: {doc_id}")`
			`print("\nTree Structure (top-level sections):")`
			`utils.print_tree(client.documents[doc_id]["structure"])`

			`# ── Step 2: Document Metadata ──────────────────────────────────────────────────`
			`print("\n" + "=" * 60)`
			`print("Step 2: Document Metadata (get_document)")`
			`print("=" * 60)`
			`print(client.get_document(doc_id))`

			`# ── Step 3: Agent Query ────────────────────────────────────────────────────────`
			`print("\n" + "=" * 60)`
			`print("Step 3: Agent Query (auto tool-use)")`
			`print("=" * 60)`
			`question = "What reward design does DeepSeek-R1-Zero use, and why was it chosen over supervised fine-tuning?"`
			`print(f"\nQuestion: '{question}'\n")`
			`query_agent(client, doc_id, question, verbose=True)`