Add PageIndexClient with agent-based retrieval via OpenAI Agents SDK (#125)

* Add PageIndexClient with retrieve, streaming support and litellm integration
* Add OpenAI agents demo example
* Update README with example agent demo section
* Support separate retrieve_model configuration for index and retrieve
2026-05-12 00:02:36 +02:00 · 2026-03-26 23:19:50 +08:00 · 2026-03-26 23:19:50 +08:00 · 5d4491f3bf
commit 5d4491f3bf
parent 2403be8f27
9 changed files with 501 additions and 7 deletions
--- a/pageindex/retrieve.py
+++ b/pageindex/retrieve.py
@ -0,0 +1,139 @@
+import json
+import PyPDF2
+
+try:
+    from .utils import get_number_of_pages, remove_fields
+except ImportError:
+    from utils import get_number_of_pages, remove_fields
+
+
+# ── Helpers ──────────────────────────────────────────────────────────────────
+
+def _parse_pages(pages: str) -> list[int]:
+    """
+    Turn a page specification into an ascending list of unique integers.
+
+    The specification is a comma-separated list whose items are either a
+    single number ('12') or an inclusive 'start-end' range ('5-7').
+    Whitespace around items and around range bounds is ignored.
+
+    Raises ValueError when an item cannot be converted by int() or when a
+    range has its start greater than its end.
+    """
+    collected = set()
+    for chunk in pages.split(','):
+        token = chunk.strip()
+        # Split on the first '-' only; whatever follows it is handed to int()
+        # as the upper bound.
+        lower_text, dash, upper_text = token.partition('-')
+        if not dash:
+            collected.add(int(token))
+            continue
+        lower = int(lower_text.strip())
+        upper = int(upper_text.strip())
+        if lower > upper:
+            raise ValueError(f"Invalid range '{token}': start must be <= end")
+        collected.update(range(lower, upper + 1))
+    return sorted(collected)
+
+
+def _count_pages(doc_info: dict) -> int:
+    """
+    Report the size of a document in its native unit.
+
+    PDF documents are measured by calling get_number_of_pages on
+    doc_info['path']. Any other document is measured by the largest
+    'line_num' found on a node of its 'structure' tree (0 when no node
+    carries a positive one).
+    """
+    if doc_info.get('type') == 'pdf':
+        return get_number_of_pages(doc_info['path'])
+
+    # Non-PDF: walk the tree with an explicit stack; visiting order does not
+    # matter because only the maximum is kept.
+    highest = 0
+    pending = list(doc_info.get('structure', []))
+    while pending:
+        node = pending.pop()
+        line_no = node.get('line_num', 0)
+        if line_no and line_no > highest:
+            highest = line_no
+        children = node.get('nodes')
+        if children:
+            pending.extend(children)
+    return highest
+
+
+def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
+    """
+    Read the text of selected PDF pages in a single pass over the file.
+
+    page_nums are 1-indexed; numbers outside 1..len(reader.pages) are
+    silently skipped. Each result is {'page': number, 'content': text},
+    where a falsy extract_text() result is replaced by ''.
+    """
+    extracted = []
+    with open(doc_info['path'], 'rb') as pdf_file:
+        reader = PyPDF2.PdfReader(pdf_file)
+        page_total = len(reader.pages)
+        for number in page_nums:
+            if number < 1 or number > page_total:
+                continue
+            # reader.pages is 0-indexed while callers count from 1.
+            text = reader.pages[number - 1].extract_text()
+            extracted.append({'page': number, 'content': text or ''})
+    return extracted
+
+
+def _get_md_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
+    """
+    Collect the text of Markdown nodes whose header line falls in a range.
+
+    For Markdown documents, 'pages' are line numbers. Only the smallest and
+    largest entries of page_nums matter: every node whose 'line_num' lies in
+    [min(page_nums), max(page_nums)] is returned, not just exact matches.
+
+    Returns a list of {'page': line_num, 'content': text} sorted by line
+    number, with at most one entry per line number (the first node met in
+    depth-first order wins; a node without 'text' contributes ''). An empty
+    page_nums yields an empty list, as _get_pdf_page_content does for an
+    empty selection.
+    """
+    # min()/max() raise ValueError on an empty sequence; nothing was asked for.
+    if not page_nums:
+        return []
+
+    first_line, last_line = min(page_nums), max(page_nums)
+    text_by_line = {}
+
+    def _collect(nodes):
+        for node in nodes:
+            line_no = node.get('line_num')
+            # Falsy line numbers (missing, None, 0) never match.
+            if line_no and first_line <= line_no <= last_line:
+                # setdefault keeps the first node seen for a given line.
+                text_by_line.setdefault(line_no, node.get('text', ''))
+            if node.get('nodes'):
+                _collect(node['nodes'])
+
+    _collect(doc_info.get('structure', []))
+    return [
+        {'page': line_no, 'content': text_by_line[line_no]}
+        for line_no in sorted(text_by_line)
+    ]
+
+
+# ── Tool functions ────────────────────────────────────────────────────────────
+
+def get_document(documents: dict, doc_id: str) -> str:
+    """
+    Return JSON with document metadata.
+
+    Fields: doc_id, doc_name, doc_description, type, status (always
+    'completed'), plus page_count for PDFs or line_count for every other
+    type. When doc_id has no (non-empty) entry in documents, the JSON is
+    {'error': ...} instead.
+
+    Non-ASCII characters are emitted verbatim (ensure_ascii=False), as
+    get_document_structure and get_page_content do for their payloads,
+    rather than as \\uXXXX escapes.
+    """
+    doc_info = documents.get(doc_id)
+    if not doc_info:
+        return json.dumps({'error': f'Document {doc_id} not found'}, ensure_ascii=False)
+
+    doc_type = doc_info.get('type', '')
+    result = {
+        'doc_id': doc_id,
+        'doc_name': doc_info.get('doc_name', ''),
+        'doc_description': doc_info.get('doc_description', ''),
+        'type': doc_type,
+        'status': 'completed',
+    }
+    # The same counter serves both kinds; only the key name differs.
+    size_key = 'page_count' if doc_type == 'pdf' else 'line_count'
+    result[size_key] = _count_pages(doc_info)
+    return json.dumps(result, ensure_ascii=False)
+
+
+def get_document_structure(documents: dict, doc_id: str) -> str:
+    """
+    Serialise a document's tree to JSON after passing it through
+    remove_fields with fields=['text'], so the reply stays small.
+
+    Yields {'error': ...} JSON when doc_id has no (non-empty) entry.
+    """
+    entry = documents.get(doc_id)
+    if entry:
+        tree = entry.get('structure', [])
+        return json.dumps(remove_fields(tree, fields=['text']), ensure_ascii=False)
+    return json.dumps({'error': f'Document {doc_id} not found'})
+
+
+def get_page_content(documents: dict, doc_id: str, pages: str) -> str:
+    """
+    Fetch the content of selected pages of one document as JSON.
+
+    pages format: '5-7', '3,8', or '12'
+    For PDF: pages are physical page numbers (1-indexed).
+    For any other type: pages are line numbers matched against node headers.
+
+    Returns a JSON list of {'page': int, 'content': str}. Every failure
+    (unknown document, unparsable pages, extraction error) is reported as
+    {'error': ...} JSON instead of being raised.
+    """
+    entry = documents.get(doc_id)
+    if not entry:
+        return json.dumps({'error': f'Document {doc_id} not found'})
+
+    try:
+        wanted = _parse_pages(pages)
+    except (ValueError, AttributeError) as exc:
+        # AttributeError covers a non-string 'pages' that lacks .split().
+        message = f'Invalid pages format: {pages!r}. Use "5-7", "3,8", or "12". Error: {exc}'
+        return json.dumps({'error': message})
+
+    try:
+        is_pdf = entry.get('type') == 'pdf'
+        extract = _get_pdf_page_content if is_pdf else _get_md_page_content
+        extracted = extract(entry, wanted)
+    except Exception as exc:
+        # Tool boundary: hand the failure back as data rather than raising.
+        return json.dumps({'error': f'Failed to read page content: {exc}'})
+
+    return json.dumps(extracted, ensure_ascii=False)