mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-13 16:52:37 +02:00
Add PageIndexClient with agent-based retrieval via OpenAI Agents SDK (#125)
* Add PageIndexClient with retrieve, streaming support and litellm integration * Add OpenAI agents demo example * Update README with example agent demo section * Support separate retrieve_model configuration for index and retrieve
This commit is contained in:
parent
2403be8f27
commit
5d4491f3bf
9 changed files with 501 additions and 7 deletions
132
pageindex/client.py
Normal file
132
pageindex/client.py
Normal file
|
|
@ -0,0 +1,132 @@
|
|||
import os
|
||||
import uuid
|
||||
import json
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
from pathlib import Path
|
||||
|
||||
from .page_index import page_index
|
||||
from .page_index_md import md_to_tree
|
||||
from .retrieve import get_document, get_document_structure, get_page_content
|
||||
from .utils import ConfigLoader
|
||||
|
||||
class PageIndexClient:
    """
    A client for indexing and retrieving document content.

    Flow: index() -> get_document() / get_document_structure() / get_page_content()

    For agent-based QA, see examples/openai_agents_demo.py.
    """

    def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, workspace: str = None):
        """Configure credentials, models, and the optional persistent workspace.

        Args:
            api_key: OpenAI API key. If given, it is exported to the
                OPENAI_API_KEY environment variable (process-wide side effect).
            model: Model used for indexing; overrides the loaded config.
            retrieve_model: Model used for retrieval; falls back to ``model``
                when the config provides no value.
            workspace: Optional directory where indexed documents are persisted
                as ``<doc_id>.json``; existing files are loaded on startup.
        """
        if api_key:
            os.environ["OPENAI_API_KEY"] = api_key
        elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
            # Legacy fallback: honor CHATGPT_API_KEY only when OPENAI_API_KEY is unset.
            os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY")

        self.workspace = Path(workspace).expanduser() if workspace else None

        overrides = {}
        if model:
            overrides["model"] = model
        if retrieve_model:
            overrides["retrieve_model"] = retrieve_model
        opt = ConfigLoader().load(overrides or None)
        self.model = opt.model
        self.retrieve_model = opt.retrieve_model or self.model

        # In-memory registry: doc_id -> record (see _build_record for the schema).
        self.documents = {}
        if self.workspace:
            self.workspace.mkdir(parents=True, exist_ok=True)
            self._load_workspace()

    def index(self, file_path: str, mode: str = "auto") -> str:
        """Index a document. Returns a document_id.

        Args:
            file_path: Path to the source document.
            mode: "pdf", "md", or "auto" (dispatch on file extension).

        Returns:
            A freshly generated UUID string identifying the document.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the format is unsupported for the given mode.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        doc_id = str(uuid.uuid4())
        ext = os.path.splitext(file_path)[1].lower()

        is_pdf = ext == '.pdf'
        is_md = ext in ['.md', '.markdown']

        if mode == "pdf" or (mode == "auto" and is_pdf):
            print(f"Indexing PDF: {file_path}")
            result = page_index(
                doc=file_path,
                model=self.model,
                if_add_node_summary='yes',
                if_add_node_text='yes',
                if_add_node_id='yes',
                if_add_doc_description='yes'
            )
            self.documents[doc_id] = self._build_record(doc_id, file_path, 'pdf', result)

        elif mode == "md" or (mode == "auto" and is_md):
            print(f"Indexing Markdown: {file_path}")
            coro = md_to_tree(
                md_path=file_path,
                if_thinning=False,
                if_add_node_summary='yes',
                summary_token_threshold=200,
                model=self.model,
                if_add_doc_description='yes',
                if_add_node_text='yes',
                if_add_node_id='yes'
            )
            result = self._run_coro_blocking(coro)
            self.documents[doc_id] = self._build_record(doc_id, file_path, 'md', result)
        else:
            raise ValueError(f"Unsupported file format for: {file_path}")

        print(f"Indexing complete. Document ID: {doc_id}")
        if self.workspace:
            self._save_doc(doc_id)
        return doc_id

    @staticmethod
    def _build_record(doc_id: str, file_path: str, doc_type: str, result: dict) -> dict:
        """Normalize an indexing result into the stored document record.

        Shared by the PDF and Markdown branches of index() so the record
        schema cannot drift between them.
        """
        return {
            'id': doc_id,
            'path': file_path,
            'type': doc_type,
            'structure': result['structure'],
            'doc_name': result.get('doc_name', ''),
            'doc_description': result.get('doc_description', '')
        }

    @staticmethod
    def _run_coro_blocking(coro):
        """Run *coro* to completion and return its result.

        asyncio.run() raises if an event loop is already running in this
        thread, so in that case the coroutine is executed on a dedicated
        worker thread with its own loop instead.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running loop in this thread: safe to run directly.
            return asyncio.run(coro)
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()

    def _save_doc(self, doc_id: str):
        """Persist one document record to ``<workspace>/<doc_id>.json``."""
        path = self.workspace / f"{doc_id}.json"
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.documents[doc_id], f, ensure_ascii=False, indent=2)

    def _load_workspace(self):
        """Load all ``*.json`` records from the workspace into memory.

        Corrupt or unreadable files are skipped with a warning rather than
        aborting startup (best-effort restore).
        """
        loaded = 0
        for path in self.workspace.glob("*.json"):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    doc = json.load(f)
                # The filename stem doubles as the doc_id (see _save_doc).
                self.documents[path.stem] = doc
                loaded += 1
            except (json.JSONDecodeError, OSError) as e:
                print(f"Warning: skipping corrupt workspace file {path.name}: {e}")
        if loaded:
            print(f"Loaded {loaded} document(s) from workspace.")

    def get_document(self, doc_id: str) -> str:
        """Return document metadata JSON."""
        return get_document(self.documents, doc_id)

    def get_document_structure(self, doc_id: str) -> str:
        """Return document tree structure JSON (without text fields)."""
        return get_document_structure(self.documents, doc_id)

    def get_page_content(self, doc_id: str, pages: str) -> str:
        """Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
        return get_page_content(self.documents, doc_id, pages)
|
||||
Loading…
Add table
Add a link
Reference in a new issue