mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-13 16:52:37 +02:00
Add PageIndexClient with agent-based retrieval via OpenAI Agents SDK (#125)
* Add PageIndexClient with retrieve, streaming support and litellm integration * Add OpenAI agents demo example * Update README with example agent demo section * Support separate retrieve_model configuration for index and retrieve
This commit is contained in:
parent
2403be8f27
commit
5d4491f3bf
9 changed files with 501 additions and 7 deletions
132
pageindex/client.py
Normal file
132
pageindex/client.py
Normal file
|
|
@ -0,0 +1,132 @@
|
|||
import os
|
||||
import uuid
|
||||
import json
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
from pathlib import Path
|
||||
|
||||
from .page_index import page_index
|
||||
from .page_index_md import md_to_tree
|
||||
from .retrieve import get_document, get_document_structure, get_page_content
|
||||
from .utils import ConfigLoader
|
||||
|
||||
class PageIndexClient:
    """
    A client for indexing and retrieving document content.

    Flow: index() -> get_document() / get_document_structure() / get_page_content()

    For agent-based QA, see examples/openai_agents_demo.py.
    """

    def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, workspace: str = None):
        """Configure credentials, models, and the optional persistent workspace.

        Args:
            api_key: OpenAI API key. If given, it is exported to the
                OPENAI_API_KEY environment variable (process-wide side effect).
            model: Model used for indexing; overrides the loaded config.
            retrieve_model: Model used for retrieval; falls back to ``model``
                when the config provides no value.
            workspace: Optional directory where indexed documents are persisted
                as ``<doc_id>.json``; existing files are loaded on startup.
        """
        if api_key:
            os.environ["OPENAI_API_KEY"] = api_key
        elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
            # Legacy fallback: honor CHATGPT_API_KEY only when OPENAI_API_KEY is unset.
            os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY")

        self.workspace = Path(workspace).expanduser() if workspace else None

        overrides = {}
        if model:
            overrides["model"] = model
        if retrieve_model:
            overrides["retrieve_model"] = retrieve_model
        opt = ConfigLoader().load(overrides or None)
        self.model = opt.model
        self.retrieve_model = opt.retrieve_model or self.model

        # In-memory registry: doc_id -> record (see _build_record for the schema).
        self.documents = {}
        if self.workspace:
            self.workspace.mkdir(parents=True, exist_ok=True)
            self._load_workspace()

    def index(self, file_path: str, mode: str = "auto") -> str:
        """Index a document. Returns a document_id.

        Args:
            file_path: Path to the source document.
            mode: "pdf", "md", or "auto" (dispatch on file extension).

        Returns:
            A freshly generated UUID string identifying the document.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the format is unsupported for the given mode.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        doc_id = str(uuid.uuid4())
        ext = os.path.splitext(file_path)[1].lower()

        is_pdf = ext == '.pdf'
        is_md = ext in ['.md', '.markdown']

        if mode == "pdf" or (mode == "auto" and is_pdf):
            print(f"Indexing PDF: {file_path}")
            result = page_index(
                doc=file_path,
                model=self.model,
                if_add_node_summary='yes',
                if_add_node_text='yes',
                if_add_node_id='yes',
                if_add_doc_description='yes'
            )
            self.documents[doc_id] = self._build_record(doc_id, file_path, 'pdf', result)

        elif mode == "md" or (mode == "auto" and is_md):
            print(f"Indexing Markdown: {file_path}")
            coro = md_to_tree(
                md_path=file_path,
                if_thinning=False,
                if_add_node_summary='yes',
                summary_token_threshold=200,
                model=self.model,
                if_add_doc_description='yes',
                if_add_node_text='yes',
                if_add_node_id='yes'
            )
            result = self._run_coro_blocking(coro)
            self.documents[doc_id] = self._build_record(doc_id, file_path, 'md', result)
        else:
            raise ValueError(f"Unsupported file format for: {file_path}")

        print(f"Indexing complete. Document ID: {doc_id}")
        if self.workspace:
            self._save_doc(doc_id)
        return doc_id

    @staticmethod
    def _build_record(doc_id: str, file_path: str, doc_type: str, result: dict) -> dict:
        """Normalize an indexing result into the stored document record.

        Shared by the PDF and Markdown branches of index() so the record
        schema cannot drift between them.
        """
        return {
            'id': doc_id,
            'path': file_path,
            'type': doc_type,
            'structure': result['structure'],
            'doc_name': result.get('doc_name', ''),
            'doc_description': result.get('doc_description', '')
        }

    @staticmethod
    def _run_coro_blocking(coro):
        """Run *coro* to completion and return its result.

        asyncio.run() raises if an event loop is already running in this
        thread, so in that case the coroutine is executed on a dedicated
        worker thread with its own loop instead.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running loop in this thread: safe to run directly.
            return asyncio.run(coro)
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()

    def _save_doc(self, doc_id: str):
        """Persist one document record to ``<workspace>/<doc_id>.json``."""
        path = self.workspace / f"{doc_id}.json"
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.documents[doc_id], f, ensure_ascii=False, indent=2)

    def _load_workspace(self):
        """Load all ``*.json`` records from the workspace into memory.

        Corrupt or unreadable files are skipped with a warning rather than
        aborting startup (best-effort restore).
        """
        loaded = 0
        for path in self.workspace.glob("*.json"):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    doc = json.load(f)
                # The filename stem doubles as the doc_id (see _save_doc).
                self.documents[path.stem] = doc
                loaded += 1
            except (json.JSONDecodeError, OSError) as e:
                print(f"Warning: skipping corrupt workspace file {path.name}: {e}")
        if loaded:
            print(f"Loaded {loaded} document(s) from workspace.")

    def get_document(self, doc_id: str) -> str:
        """Return document metadata JSON."""
        return get_document(self.documents, doc_id)

    def get_document_structure(self, doc_id: str) -> str:
        """Return document tree structure JSON (without text fields)."""
        return get_document_structure(self.documents, doc_id)

    def get_page_content(self, doc_id: str, pages: str) -> str:
        """Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
        return get_page_content(self.documents, doc_id, pages)
|
||||
Loading…
Add table
Add a link
Reference in a new issue