feat: add PageIndex SDK with local/cloud dual-mode support (#207)

This commit is contained in:
Kylin 2026-04-06 22:51:04 +08:00 committed by Ray
parent f2dcffc0b7
commit c7fe93bb56
45 changed files with 4225 additions and 274 deletions

14
tests/test_agent.py Normal file
View file

@@ -0,0 +1,14 @@
from pageindex.agent import AgentRunner, SYSTEM_PROMPT
from pageindex.backend.protocol import AgentTools


def test_agent_runner_init():
    """AgentRunner keeps the model name it was constructed with."""
    tools = AgentTools(function_tools=["mock_tool"])
    runner = AgentRunner(tools=tools, model="gpt-4o")
    assert runner._model == "gpt-4o"


def test_system_prompt_has_tool_instructions():
    """The system prompt must mention every built-in tool by name."""
    for tool_name in ("list_documents", "get_document_structure", "get_page_content"):
        assert tool_name in SYSTEM_PROMPT

51
tests/test_client.py Normal file
View file

@@ -0,0 +1,51 @@
# Unit tests for the client entry points (LocalClient / CloudClient).
import pytest
from pageindex.client import PageIndexClient, LocalClient, CloudClient


def _local(tmp_path):
    """Return a LocalClient whose storage lives under this test's tmp dir."""
    return LocalClient(model="gpt-4o", storage_path=str(tmp_path / "pi"))


def test_local_client_is_pageindex_client(tmp_path):
    """LocalClient satisfies the PageIndexClient interface."""
    assert isinstance(_local(tmp_path), PageIndexClient)


def test_cloud_client_is_pageindex_client():
    """CloudClient satisfies the PageIndexClient interface."""
    assert isinstance(CloudClient(api_key="pi-test"), PageIndexClient)


def test_collection_default_name(tmp_path):
    """collection() with no argument yields the 'default' collection."""
    assert _local(tmp_path).collection().name == "default"


def test_collection_custom_name(tmp_path):
    """collection(name) yields a collection carrying that name."""
    assert _local(tmp_path).collection("papers").name == "papers"


def test_list_collections_empty(tmp_path):
    """A fresh client starts with no collections."""
    assert _local(tmp_path).list_collections() == []


def test_list_collections_after_create(tmp_path):
    """Creating a collection makes it visible in list_collections()."""
    client = _local(tmp_path)
    client.collection("papers")
    assert "papers" in client.list_collections()


def test_delete_collection(tmp_path):
    """delete_collection removes the collection from the listing."""
    client = _local(tmp_path)
    client.collection("papers")
    client.delete_collection("papers")
    assert "papers" not in client.list_collections()


def test_register_parser(tmp_path):
    """register_parser accepts any object with the DocumentParser shape."""

    class FakeParser:
        def supported_extensions(self):
            return [".txt"]

        def parse(self, file_path, **kwargs):
            pass

    _local(tmp_path).register_parser(FakeParser())

View file

@@ -0,0 +1,16 @@
from pageindex.backend.cloud import CloudBackend, API_BASE


def test_cloud_backend_init():
    """The API key is stored and propagated into the request headers."""
    backend = CloudBackend(api_key="pi-test")
    assert backend._api_key == "pi-test"
    assert backend._headers["api_key"] == "pi-test"


def test_api_base_url():
    """The default API base points at the hosted pageindex service."""
    assert "pageindex.ai" in API_BASE


def test_get_retrieve_model_is_none():
    # NOTE(review): the name says "retrieve model" but the body checks
    # get_agent_tools(); consider renaming this test. Kept as-is here to
    # preserve the existing test id.
    backend = CloudBackend(api_key="pi-test")
    assert backend.get_agent_tools("col").function_tools == []

41
tests/test_collection.py Normal file
View file

@@ -0,0 +1,41 @@
# Unit tests for Collection, exercised against a mocked backend.
import pytest
from unittest.mock import MagicMock
from pageindex.collection import Collection


@pytest.fixture
def col():
    """A Collection named 'papers' wired to a MagicMock backend."""
    backend = MagicMock()
    backend.list_documents.return_value = [
        {"doc_id": "d1", "doc_name": "paper.pdf", "doc_type": "pdf"}
    ]
    backend.get_document.return_value = {"doc_id": "d1", "doc_name": "paper.pdf"}
    backend.add_document.return_value = "d1"
    return Collection(name="papers", backend=backend)


def test_add(col):
    """add() forwards to the backend and returns the new document id."""
    assert col.add("paper.pdf") == "d1"
    col._backend.add_document.assert_called_once_with("papers", "paper.pdf")


def test_list_documents(col):
    """list_documents() surfaces the backend's document records."""
    docs = col.list_documents()
    assert len(docs) == 1
    assert docs[0]["doc_id"] == "d1"


def test_get_document(col):
    """get_document() returns the backend's record for that id."""
    assert col.get_document("d1")["doc_name"] == "paper.pdf"


def test_delete_document(col):
    """delete_document() delegates to the backend with collection + id."""
    col.delete_document("d1")
    col._backend.delete_document.assert_called_once_with("papers", "d1")


def test_name_property(col):
    """name exposes the collection's configured name."""
    assert col.name == "papers"

28
tests/test_config.py Normal file
View file

@@ -0,0 +1,28 @@
# Unit tests for IndexConfig defaults, overrides, and copying.
import pytest
from pageindex.config import IndexConfig


def test_defaults():
    """A fresh config carries the documented default values."""
    config = IndexConfig()
    assert config.model == "gpt-4o-2024-11-20"
    assert config.retrieve_model is None
    assert config.toc_check_page_num == 20


def test_overrides():
    """Constructor keyword arguments override the defaults."""
    config = IndexConfig(model="gpt-5.4", retrieve_model="claude-sonnet")
    assert config.model == "gpt-5.4"
    assert config.retrieve_model == "claude-sonnet"


def test_unknown_key_raises():
    """Unknown constructor keys are rejected rather than silently ignored."""
    with pytest.raises(Exception):
        IndexConfig(nonexistent_key="value")


def test_model_copy_with_update():
    """model_copy(update=...) changes only the requested fields."""
    config = IndexConfig(toc_check_page_num=30)
    updated = config.model_copy(update={"model": "gpt-5.4"})
    assert updated.model == "gpt-5.4"
    assert updated.toc_check_page_num == 30

View file

@@ -0,0 +1,45 @@
from pageindex.parser.protocol import ContentNode, ParsedDocument, DocumentParser


def test_content_node_required_fields():
    """Only content and tokens are required; the rest default to None."""
    node = ContentNode(content="hello", tokens=5)
    assert node.content == "hello"
    assert node.tokens == 5
    assert node.title is None
    assert node.index is None
    assert node.level is None


def test_content_node_all_fields():
    """Optional title/index/level are stored when supplied."""
    node = ContentNode(content="# Intro", tokens=10, title="Intro", index=1, level=1)
    assert node.title == "Intro"
    assert node.index == 1
    assert node.level == 1


def test_parsed_document():
    """ParsedDocument keeps its name and nodes; metadata defaults to None."""
    nodes = [ContentNode(content="page1", tokens=100, index=1)]
    doc = ParsedDocument(doc_name="test.pdf", nodes=nodes)
    assert doc.doc_name == "test.pdf"
    assert len(doc.nodes) == 1
    assert doc.metadata is None


def test_parsed_document_with_metadata():
    """Arbitrary metadata dicts are preserved as given."""
    nodes = [ContentNode(content="page1", tokens=100)]
    doc = ParsedDocument(doc_name="test.pdf", nodes=nodes, metadata={"author": "John"})
    assert doc.metadata["author"] == "John"


def test_document_parser_protocol():
    """Verify a class implementing DocumentParser is structurally compatible."""

    class MyParser:
        def supported_extensions(self) -> list[str]:
            return [".txt"]

        def parse(self, file_path: str, **kwargs) -> ParsedDocument:
            return ParsedDocument(doc_name="test", nodes=[])

    parser = MyParser()
    assert parser.supported_extensions() == [".txt"]
    result = parser.parse("test.txt")
    assert isinstance(result, ParsedDocument)

27
tests/test_errors.py Normal file
View file

@@ -0,0 +1,27 @@
from pageindex.errors import (
    PageIndexError,
    CollectionNotFoundError,
    DocumentNotFoundError,
    IndexingError,
    CloudAPIError,
    FileTypeError,
)

# Every concrete error class the package exposes.
_CONCRETE_ERRORS = (
    CollectionNotFoundError,
    DocumentNotFoundError,
    IndexingError,
    CloudAPIError,
    FileTypeError,
)


def test_all_errors_inherit_from_base():
    """Each concrete error derives from PageIndexError (and Exception)."""
    for cls in _CONCRETE_ERRORS:
        assert issubclass(cls, PageIndexError)
        assert issubclass(cls, Exception)


def test_error_message():
    """str() of an error is exactly the message it was built with."""
    err = FileTypeError("Unsupported: .docx")
    assert str(err) == "Unsupported: .docx"


def test_catch_base_catches_all():
    """An `except PageIndexError` clause intercepts every subclass."""
    for cls in _CONCRETE_ERRORS:
        try:
            raise cls("test")
        except PageIndexError:
            pass  # expected

26
tests/test_events.py Normal file
View file

@@ -0,0 +1,26 @@
from pageindex.events import QueryEvent
from pageindex.backend.protocol import AgentTools


def test_query_event():
    """QueryEvent stores its type and payload verbatim."""
    event = QueryEvent(type="answer_delta", data="hello")
    assert event.type == "answer_delta"
    assert event.data == "hello"


def test_query_event_types():
    """All known event types round-trip through the constructor."""
    for t in ["reasoning", "tool_call", "tool_result", "answer_delta", "answer_done"]:
        assert QueryEvent(type=t, data="test").type == t


def test_agent_tools_default_empty():
    """AgentTools defaults to empty tool and server lists."""
    tools = AgentTools()
    assert tools.function_tools == []
    assert tools.mcp_servers == []


def test_agent_tools_with_values():
    """Supplied tool/server lists are kept as given."""
    tools = AgentTools(function_tools=["tool1"], mcp_servers=["server1"])
    assert len(tools.function_tools) == 1
    assert len(tools.mcp_servers) == 1

View file

@@ -0,0 +1,50 @@
# Unit tests for LocalBackend on top of a temporary SQLite store.
import pytest
from pathlib import Path
from pageindex.backend.local import LocalBackend
from pageindex.storage.sqlite import SQLiteStorage
from pageindex.errors import FileTypeError


@pytest.fixture
def backend(tmp_path):
    """A LocalBackend backed by a throwaway SQLite db and files dir."""
    storage = SQLiteStorage(str(tmp_path / "test.db"))
    files_dir = tmp_path / "files"
    return LocalBackend(storage=storage, files_dir=str(files_dir), model="gpt-4o")


def test_collection_lifecycle(backend):
    """Collections appear after creation and vanish after deletion."""
    backend.get_or_create_collection("papers")
    assert "papers" in backend.list_collections()
    backend.delete_collection("papers")
    assert "papers" not in backend.list_collections()


def test_list_documents_empty(backend):
    """A brand-new collection contains no documents."""
    backend.get_or_create_collection("papers")
    assert backend.list_documents("papers") == []


def test_unsupported_file_type_raises(backend, tmp_path):
    """Adding a file with no registered parser raises FileTypeError."""
    backend.get_or_create_collection("papers")
    bad_file = tmp_path / "test.xyz"
    bad_file.write_text("hello")
    with pytest.raises(FileTypeError):
        backend.add_document("papers", str(bad_file))


def test_register_custom_parser(backend):
    """A user-registered parser makes its extensions resolvable."""
    from pageindex.parser.protocol import ParsedDocument, ContentNode

    class TxtParser:
        def supported_extensions(self):
            return [".txt"]

        def parse(self, file_path, **kwargs):
            text = Path(file_path).read_text()
            return ParsedDocument(doc_name="test", nodes=[
                ContentNode(content=text, tokens=len(text.split()), title="Content", index=1, level=1)
            ])

    backend.register_parser(TxtParser())
    # Now .txt should be supported (won't raise FileTypeError)
    assert backend._resolve_parser("test.txt") is not None

View file

@@ -0,0 +1,55 @@
import pytest
from pathlib import Path
from pageindex.parser.markdown import MarkdownParser
from pageindex.parser.protocol import ContentNode, ParsedDocument


@pytest.fixture
def sample_md(tmp_path):
    """Write a small two-chapter markdown file and return its path."""
    md = tmp_path / "test.md"
    md.write_text("""# Chapter 1
Some intro text.
## Section 1.1
Details here.
## Section 1.2
More details.
# Chapter 2
Another chapter.
""")
    return str(md)


def test_supported_extensions():
    """Both .md and .markdown are accepted."""
    exts = MarkdownParser().supported_extensions()
    assert ".md" in exts
    assert ".markdown" in exts


def test_parse_returns_parsed_document(sample_md):
    """parse() yields a ParsedDocument named after the file stem."""
    result = MarkdownParser().parse(sample_md)
    assert isinstance(result, ParsedDocument)
    assert result.doc_name == "test"


def test_parse_nodes_have_level(sample_md):
    """Heading depth is mapped onto each node's level."""
    result = MarkdownParser().parse(sample_md)
    assert len(result.nodes) == 4
    assert result.nodes[0].level == 1
    assert result.nodes[0].title == "Chapter 1"
    assert result.nodes[1].level == 2
    assert result.nodes[1].title == "Section 1.1"
    assert result.nodes[3].level == 1


def test_parse_nodes_have_content(sample_md):
    """Body text under each heading lands in that node's content."""
    result = MarkdownParser().parse(sample_md)
    assert "Some intro text" in result.nodes[0].content
    assert "Details here" in result.nodes[1].content


def test_parse_nodes_have_index(sample_md):
    """Every node carries a non-None index."""
    result = MarkdownParser().parse(sample_md)
    for node in result.nodes:
        assert node.index is not None

29
tests/test_pdf_parser.py Normal file
View file

@@ -0,0 +1,29 @@
import pytest
from pathlib import Path
from pageindex.parser.pdf import PdfParser
from pageindex.parser.protocol import ContentNode, ParsedDocument

# Real-PDF tests are skipped when this fixture file is absent.
TEST_PDF = Path("tests/pdfs/deepseek-r1.pdf")


def test_supported_extensions():
    """PdfParser handles .pdf files."""
    assert ".pdf" in PdfParser().supported_extensions()


@pytest.mark.skipif(not TEST_PDF.exists(), reason="Test PDF not available")
def test_parse_returns_parsed_document():
    """Parsing a real PDF yields a named, non-empty ParsedDocument."""
    result = PdfParser().parse(str(TEST_PDF))
    assert isinstance(result, ParsedDocument)
    assert len(result.nodes) > 0
    assert result.doc_name != ""


@pytest.mark.skipif(not TEST_PDF.exists(), reason="Test PDF not available")
def test_parse_nodes_are_flat_without_level():
    """PDF nodes are a flat page list: indexed, token-counted, no levels."""
    result = PdfParser().parse(str(TEST_PDF))
    for node in result.nodes:
        assert isinstance(node, ContentNode)
        assert node.content is not None
        assert node.tokens >= 0
        assert node.index is not None
        assert node.level is None

95
tests/test_pipeline.py Normal file
View file

@@ -0,0 +1,95 @@
# Unit tests for the indexing pipeline: strategy detection, level-based
# tree construction, the content-based fallback, and the null logger.
import asyncio
from unittest.mock import patch, AsyncMock
from pageindex.parser.protocol import ContentNode, ParsedDocument
from pageindex.index.pipeline import (
    detect_strategy, build_tree_from_levels, build_index,
    _content_based_pipeline, _NullLogger,
)


def test_detect_strategy_with_level():
    """Nodes carrying heading levels select the level-based strategy."""
    nodes = [
        ContentNode(content="# Intro", tokens=10, title="Intro", index=1, level=1),
        ContentNode(content="## Details", tokens=10, title="Details", index=5, level=2),
    ]
    assert detect_strategy(nodes) == "level_based"


def test_detect_strategy_without_level():
    """Flat, level-less nodes select the content-based strategy."""
    nodes = [
        ContentNode(content="Page 1 text", tokens=100, index=1),
        ContentNode(content="Page 2 text", tokens=100, index=2),
    ]
    assert detect_strategy(nodes) == "content_based"


def test_build_tree_from_levels():
    """Level-2 nodes nest under the preceding level-1 node."""
    nodes = [
        ContentNode(content="ch1 text", tokens=10, title="Chapter 1", index=1, level=1),
        ContentNode(content="s1.1 text", tokens=10, title="Section 1.1", index=5, level=2),
        ContentNode(content="s1.2 text", tokens=10, title="Section 1.2", index=10, level=2),
        ContentNode(content="ch2 text", tokens=10, title="Chapter 2", index=20, level=1),
    ]
    tree = build_tree_from_levels(nodes)
    assert len(tree) == 2  # 2 root nodes (chapters)
    assert tree[0]["title"] == "Chapter 1"
    assert len(tree[0]["nodes"]) == 2  # 2 sections under chapter 1
    assert tree[0]["nodes"][0]["title"] == "Section 1.1"
    assert tree[0]["nodes"][1]["title"] == "Section 1.2"
    assert tree[1]["title"] == "Chapter 2"
    assert len(tree[1]["nodes"]) == 0


def test_build_tree_from_levels_single_level():
    """All-level-1 input produces a flat list of roots."""
    nodes = [
        ContentNode(content="a", tokens=5, title="A", index=1, level=1),
        ContentNode(content="b", tokens=5, title="B", index=2, level=1),
    ]
    tree = build_tree_from_levels(nodes)
    assert len(tree) == 2
    assert tree[0]["title"] == "A"
    assert tree[1]["title"] == "B"


def test_build_tree_from_levels_deep_nesting():
    """Strictly increasing levels chain into a single deep branch."""
    nodes = [
        ContentNode(content="h1", tokens=5, title="H1", index=1, level=1),
        ContentNode(content="h2", tokens=5, title="H2", index=2, level=2),
        ContentNode(content="h3", tokens=5, title="H3", index=3, level=3),
    ]
    tree = build_tree_from_levels(nodes)
    assert len(tree) == 1
    assert tree[0]["title"] == "H1"
    assert len(tree[0]["nodes"]) == 1
    assert tree[0]["nodes"][0]["title"] == "H2"
    assert len(tree[0]["nodes"][0]["nodes"]) == 1
    assert tree[0]["nodes"][0]["nodes"][0]["title"] == "H3"


def test_content_based_pipeline_does_not_raise():
    """_content_based_pipeline should delegate to tree_parser, not raise NotImplementedError."""
    fake_tree = [{"title": "Intro", "start_index": 1, "end_index": 2, "nodes": []}]

    # Async stand-in for the real tree_parser; signature mirrors the callee.
    async def fake_tree_parser(page_list, opt, doc=None, logger=None):
        return fake_tree

    page_list = [("Page 1 text", 50), ("Page 2 text", 60)]
    from types import SimpleNamespace
    opt = SimpleNamespace(model="test-model")
    # Patch at the use-site module so the pipeline picks up the fake.
    with patch("pageindex.index.page_index.tree_parser", new=fake_tree_parser):
        result = asyncio.run(_content_based_pipeline(page_list, opt))
    assert result == fake_tree


def test_null_logger_methods():
    """NullLogger should have info/error/debug and not raise."""
    logger = _NullLogger()
    logger.info("test message")
    logger.error("test error")
    logger.debug("test debug")
    logger.info({"key": "value"})

View file

@@ -0,0 +1,61 @@
# Unit tests for the SQLite storage engine.
import pytest
from pageindex.storage.sqlite import SQLiteStorage


@pytest.fixture
def storage(tmp_path):
    """A SQLiteStorage backed by a throwaway database file."""
    return SQLiteStorage(str(tmp_path / "test.db"))


def _pdf_doc(name, path, structure=None):
    """Minimal pdf document record shared by the bulk tests."""
    return {
        "doc_name": name,
        "doc_type": "pdf",
        "file_path": path,
        "structure": [] if structure is None else structure,
    }


def test_create_and_list_collections(storage):
    """Created collections show up in the listing."""
    storage.create_collection("papers")
    assert "papers" in storage.list_collections()


def test_get_or_create_collection_idempotent(storage):
    """Calling get_or_create twice must not duplicate the collection."""
    storage.get_or_create_collection("papers")
    storage.get_or_create_collection("papers")
    assert storage.list_collections().count("papers") == 1


def test_delete_collection(storage):
    """Deleted collections disappear from the listing."""
    storage.create_collection("papers")
    storage.delete_collection("papers")
    assert "papers" not in storage.list_collections()


def test_save_and_get_document(storage):
    """Saved document fields round-trip through get_document."""
    storage.create_collection("papers")
    doc = {
        "doc_name": "test.pdf", "doc_description": "A test",
        "file_path": "/tmp/test.pdf", "doc_type": "pdf",
        "structure": [{"title": "Intro", "node_id": "0001"}],
    }
    storage.save_document("papers", "doc-1", doc)
    result = storage.get_document("papers", "doc-1")
    assert result["doc_name"] == "test.pdf"
    assert result["doc_type"] == "pdf"


def test_get_document_structure(storage):
    """get_document_structure returns the stored tree as saved."""
    storage.create_collection("papers")
    structure = [{"title": "Ch1", "node_id": "0001", "nodes": []}]
    storage.save_document("papers", "doc-1", _pdf_doc("test.pdf", "/tmp/test.pdf", structure))
    result = storage.get_document_structure("papers", "doc-1")
    assert result[0]["title"] == "Ch1"


def test_list_documents(storage):
    """Every saved document is returned by list_documents."""
    storage.create_collection("papers")
    storage.save_document("papers", "doc-1", _pdf_doc("p1.pdf", "/tmp/p1.pdf"))
    storage.save_document("papers", "doc-2", _pdf_doc("p2.pdf", "/tmp/p2.pdf"))
    assert len(storage.list_documents("papers")) == 2


def test_delete_document(storage):
    """delete_document removes the record from the collection."""
    storage.create_collection("papers")
    storage.save_document("papers", "doc-1", _pdf_doc("test.pdf", "/tmp/test.pdf"))
    storage.delete_document("papers", "doc-1")
    assert len(storage.list_documents("papers")) == 0


def test_delete_collection_cascades_documents(storage):
    """Dropping a collection also drops its documents."""
    storage.create_collection("papers")
    storage.save_document("papers", "doc-1", _pdf_doc("test.pdf", "/tmp/test.pdf"))
    storage.delete_collection("papers")
    assert "papers" not in storage.list_collections()

View file

@@ -0,0 +1,19 @@
from pageindex.storage.protocol import StorageEngine


def test_storage_engine_is_protocol():
    """A structurally complete implementation passes isinstance(),
    i.e. StorageEngine behaves as a runtime-checkable structural type."""

    class FakeStorage:
        def create_collection(self, name: str) -> None: pass
        def get_or_create_collection(self, name: str) -> None: pass
        def list_collections(self) -> list[str]: return []
        def delete_collection(self, name: str) -> None: pass
        def save_document(self, collection: str, doc_id: str, doc: dict) -> None: pass
        def find_document_by_hash(self, collection: str, file_hash: str) -> str | None: return None
        def get_document(self, collection: str, doc_id: str) -> dict: return {}
        def get_document_structure(self, collection: str, doc_id: str) -> dict: return {}
        def get_pages(self, collection: str, doc_id: str) -> list | None: return None
        def list_documents(self, collection: str) -> list[dict]: return []
        def delete_document(self, collection: str, doc_id: str) -> None: pass
        def close(self) -> None: pass

    fake = FakeStorage()
    assert isinstance(fake, StorageEngine)