feat: add PageIndex SDK with local/cloud dual-mode support (#207)

This commit is contained in:
Kylin 2026-04-06 22:51:04 +08:00 committed by Ray
parent f2dcffc0b7
commit c7fe93bb56
45 changed files with 4225 additions and 274 deletions

14
tests/test_agent.py Normal file
View file

@@ -0,0 +1,14 @@
from pageindex.agent import AgentRunner, SYSTEM_PROMPT
from pageindex.backend.protocol import AgentTools


def test_agent_runner_init():
    """AgentRunner keeps the model name it was constructed with."""
    tools = AgentTools(function_tools=["mock_tool"])
    runner = AgentRunner(tools=tools, model="gpt-4o")
    assert runner._model == "gpt-4o"


def test_system_prompt_has_tool_instructions():
    """The system prompt must mention every built-in tool by name."""
    for tool_name in ("list_documents", "get_document_structure", "get_page_content"):
        assert tool_name in SYSTEM_PROMPT

51
tests/test_client.py Normal file
View file

@@ -0,0 +1,51 @@
# Unit tests for the client entry points (LocalClient / CloudClient).
import pytest
from pageindex.client import PageIndexClient, LocalClient, CloudClient


def _local(tmp_path):
    """Return a LocalClient whose storage lives under this test's tmp dir."""
    return LocalClient(model="gpt-4o", storage_path=str(tmp_path / "pi"))


def test_local_client_is_pageindex_client(tmp_path):
    """LocalClient satisfies the PageIndexClient interface."""
    assert isinstance(_local(tmp_path), PageIndexClient)


def test_cloud_client_is_pageindex_client():
    """CloudClient satisfies the PageIndexClient interface."""
    assert isinstance(CloudClient(api_key="pi-test"), PageIndexClient)


def test_collection_default_name(tmp_path):
    """collection() with no argument yields the 'default' collection."""
    assert _local(tmp_path).collection().name == "default"


def test_collection_custom_name(tmp_path):
    """collection(name) yields a collection carrying that name."""
    assert _local(tmp_path).collection("papers").name == "papers"


def test_list_collections_empty(tmp_path):
    """A fresh client starts with no collections."""
    assert _local(tmp_path).list_collections() == []


def test_list_collections_after_create(tmp_path):
    """Creating a collection makes it visible in list_collections()."""
    client = _local(tmp_path)
    client.collection("papers")
    assert "papers" in client.list_collections()


def test_delete_collection(tmp_path):
    """delete_collection removes the collection from the listing."""
    client = _local(tmp_path)
    client.collection("papers")
    client.delete_collection("papers")
    assert "papers" not in client.list_collections()


def test_register_parser(tmp_path):
    """register_parser accepts any object with the DocumentParser shape."""

    class FakeParser:
        def supported_extensions(self):
            return [".txt"]

        def parse(self, file_path, **kwargs):
            pass

    _local(tmp_path).register_parser(FakeParser())

View file

@@ -0,0 +1,16 @@
from pageindex.backend.cloud import CloudBackend, API_BASE


def test_cloud_backend_init():
    """The API key is stored and propagated into the request headers."""
    backend = CloudBackend(api_key="pi-test")
    assert backend._api_key == "pi-test"
    assert backend._headers["api_key"] == "pi-test"


def test_api_base_url():
    """The default API base points at the hosted pageindex service."""
    assert "pageindex.ai" in API_BASE


def test_get_retrieve_model_is_none():
    # NOTE(review): the name says "retrieve model" but the body checks
    # get_agent_tools(); consider renaming this test. Kept as-is here to
    # preserve the existing test id.
    backend = CloudBackend(api_key="pi-test")
    assert backend.get_agent_tools("col").function_tools == []

41
tests/test_collection.py Normal file
View file

@@ -0,0 +1,41 @@
# Unit tests for Collection, exercised against a mocked backend.
import pytest
from unittest.mock import MagicMock
from pageindex.collection import Collection


@pytest.fixture
def col():
    """A Collection named 'papers' wired to a MagicMock backend."""
    backend = MagicMock()
    backend.list_documents.return_value = [
        {"doc_id": "d1", "doc_name": "paper.pdf", "doc_type": "pdf"}
    ]
    backend.get_document.return_value = {"doc_id": "d1", "doc_name": "paper.pdf"}
    backend.add_document.return_value = "d1"
    return Collection(name="papers", backend=backend)


def test_add(col):
    """add() forwards to the backend and returns the new document id."""
    assert col.add("paper.pdf") == "d1"
    col._backend.add_document.assert_called_once_with("papers", "paper.pdf")


def test_list_documents(col):
    """list_documents() surfaces the backend's document records."""
    docs = col.list_documents()
    assert len(docs) == 1
    assert docs[0]["doc_id"] == "d1"


def test_get_document(col):
    """get_document() returns the backend's record for that id."""
    assert col.get_document("d1")["doc_name"] == "paper.pdf"


def test_delete_document(col):
    """delete_document() delegates to the backend with collection + id."""
    col.delete_document("d1")
    col._backend.delete_document.assert_called_once_with("papers", "d1")


def test_name_property(col):
    """name exposes the collection's configured name."""
    assert col.name == "papers"

28
tests/test_config.py Normal file
View file

@@ -0,0 +1,28 @@
# Unit tests for IndexConfig defaults, overrides, and copying.
import pytest
from pageindex.config import IndexConfig


def test_defaults():
    """A fresh config carries the documented default values."""
    config = IndexConfig()
    assert config.model == "gpt-4o-2024-11-20"
    assert config.retrieve_model is None
    assert config.toc_check_page_num == 20


def test_overrides():
    """Constructor keyword arguments override the defaults."""
    config = IndexConfig(model="gpt-5.4", retrieve_model="claude-sonnet")
    assert config.model == "gpt-5.4"
    assert config.retrieve_model == "claude-sonnet"


def test_unknown_key_raises():
    """Unknown constructor keys are rejected rather than silently ignored."""
    with pytest.raises(Exception):
        IndexConfig(nonexistent_key="value")


def test_model_copy_with_update():
    """model_copy(update=...) changes only the requested fields."""
    config = IndexConfig(toc_check_page_num=30)
    updated = config.model_copy(update={"model": "gpt-5.4"})
    assert updated.model == "gpt-5.4"
    assert updated.toc_check_page_num == 30

View file

@@ -0,0 +1,45 @@
from pageindex.parser.protocol import ContentNode, ParsedDocument, DocumentParser


def test_content_node_required_fields():
    """Only content and tokens are required; the rest default to None."""
    node = ContentNode(content="hello", tokens=5)
    assert node.content == "hello"
    assert node.tokens == 5
    assert node.title is None
    assert node.index is None
    assert node.level is None


def test_content_node_all_fields():
    """Optional title/index/level are stored when supplied."""
    node = ContentNode(content="# Intro", tokens=10, title="Intro", index=1, level=1)
    assert node.title == "Intro"
    assert node.index == 1
    assert node.level == 1


def test_parsed_document():
    """ParsedDocument keeps its name and nodes; metadata defaults to None."""
    nodes = [ContentNode(content="page1", tokens=100, index=1)]
    doc = ParsedDocument(doc_name="test.pdf", nodes=nodes)
    assert doc.doc_name == "test.pdf"
    assert len(doc.nodes) == 1
    assert doc.metadata is None


def test_parsed_document_with_metadata():
    """Arbitrary metadata dicts are preserved as given."""
    nodes = [ContentNode(content="page1", tokens=100)]
    doc = ParsedDocument(doc_name="test.pdf", nodes=nodes, metadata={"author": "John"})
    assert doc.metadata["author"] == "John"


def test_document_parser_protocol():
    """Verify a class implementing DocumentParser is structurally compatible."""

    class MyParser:
        def supported_extensions(self) -> list[str]:
            return [".txt"]

        def parse(self, file_path: str, **kwargs) -> ParsedDocument:
            return ParsedDocument(doc_name="test", nodes=[])

    parser = MyParser()
    assert parser.supported_extensions() == [".txt"]
    result = parser.parse("test.txt")
    assert isinstance(result, ParsedDocument)

27
tests/test_errors.py Normal file
View file

@@ -0,0 +1,27 @@
from pageindex.errors import (
    PageIndexError,
    CollectionNotFoundError,
    DocumentNotFoundError,
    IndexingError,
    CloudAPIError,
    FileTypeError,
)

# Every concrete error class the package exposes.
_CONCRETE_ERRORS = (
    CollectionNotFoundError,
    DocumentNotFoundError,
    IndexingError,
    CloudAPIError,
    FileTypeError,
)


def test_all_errors_inherit_from_base():
    """Each concrete error derives from PageIndexError (and Exception)."""
    for cls in _CONCRETE_ERRORS:
        assert issubclass(cls, PageIndexError)
        assert issubclass(cls, Exception)


def test_error_message():
    """str() of an error is exactly the message it was built with."""
    err = FileTypeError("Unsupported: .docx")
    assert str(err) == "Unsupported: .docx"


def test_catch_base_catches_all():
    """An `except PageIndexError` clause intercepts every subclass."""
    for cls in _CONCRETE_ERRORS:
        try:
            raise cls("test")
        except PageIndexError:
            pass  # expected

26
tests/test_events.py Normal file
View file

@@ -0,0 +1,26 @@
from pageindex.events import QueryEvent
from pageindex.backend.protocol import AgentTools


def test_query_event():
    """QueryEvent stores its type and payload verbatim."""
    event = QueryEvent(type="answer_delta", data="hello")
    assert event.type == "answer_delta"
    assert event.data == "hello"


def test_query_event_types():
    """All known event types round-trip through the constructor."""
    for t in ["reasoning", "tool_call", "tool_result", "answer_delta", "answer_done"]:
        assert QueryEvent(type=t, data="test").type == t


def test_agent_tools_default_empty():
    """AgentTools defaults to empty tool and server lists."""
    tools = AgentTools()
    assert tools.function_tools == []
    assert tools.mcp_servers == []


def test_agent_tools_with_values():
    """Supplied tool/server lists are kept as given."""
    tools = AgentTools(function_tools=["tool1"], mcp_servers=["server1"])
    assert len(tools.function_tools) == 1
    assert len(tools.mcp_servers) == 1

View file

@@ -0,0 +1,50 @@
# Unit tests for LocalBackend on top of a temporary SQLite store.
import pytest
from pathlib import Path
from pageindex.backend.local import LocalBackend
from pageindex.storage.sqlite import SQLiteStorage
from pageindex.errors import FileTypeError


@pytest.fixture
def backend(tmp_path):
    """A LocalBackend backed by a throwaway SQLite db and files dir."""
    storage = SQLiteStorage(str(tmp_path / "test.db"))
    files_dir = tmp_path / "files"
    return LocalBackend(storage=storage, files_dir=str(files_dir), model="gpt-4o")


def test_collection_lifecycle(backend):
    """Collections appear after creation and vanish after deletion."""
    backend.get_or_create_collection("papers")
    assert "papers" in backend.list_collections()
    backend.delete_collection("papers")
    assert "papers" not in backend.list_collections()


def test_list_documents_empty(backend):
    """A brand-new collection contains no documents."""
    backend.get_or_create_collection("papers")
    assert backend.list_documents("papers") == []


def test_unsupported_file_type_raises(backend, tmp_path):
    """Adding a file with no registered parser raises FileTypeError."""
    backend.get_or_create_collection("papers")
    bad_file = tmp_path / "test.xyz"
    bad_file.write_text("hello")
    with pytest.raises(FileTypeError):
        backend.add_document("papers", str(bad_file))


def test_register_custom_parser(backend):
    """A user-registered parser makes its extensions resolvable."""
    from pageindex.parser.protocol import ParsedDocument, ContentNode

    class TxtParser:
        def supported_extensions(self):
            return [".txt"]

        def parse(self, file_path, **kwargs):
            text = Path(file_path).read_text()
            return ParsedDocument(doc_name="test", nodes=[
                ContentNode(content=text, tokens=len(text.split()), title="Content", index=1, level=1)
            ])

    backend.register_parser(TxtParser())
    # Now .txt should be supported (won't raise FileTypeError)
    assert backend._resolve_parser("test.txt") is not None

View file

@@ -0,0 +1,55 @@
import pytest
from pathlib import Path
from pageindex.parser.markdown import MarkdownParser
from pageindex.parser.protocol import ContentNode, ParsedDocument


@pytest.fixture
def sample_md(tmp_path):
    """Write a small two-chapter markdown file and return its path."""
    md = tmp_path / "test.md"
    md.write_text("""# Chapter 1
Some intro text.
## Section 1.1
Details here.
## Section 1.2
More details.
# Chapter 2
Another chapter.
""")
    return str(md)


def test_supported_extensions():
    """Both .md and .markdown are accepted."""
    exts = MarkdownParser().supported_extensions()
    assert ".md" in exts
    assert ".markdown" in exts


def test_parse_returns_parsed_document(sample_md):
    """parse() yields a ParsedDocument named after the file stem."""
    result = MarkdownParser().parse(sample_md)
    assert isinstance(result, ParsedDocument)
    assert result.doc_name == "test"


def test_parse_nodes_have_level(sample_md):
    """Heading depth is mapped onto each node's level."""
    result = MarkdownParser().parse(sample_md)
    assert len(result.nodes) == 4
    assert result.nodes[0].level == 1
    assert result.nodes[0].title == "Chapter 1"
    assert result.nodes[1].level == 2
    assert result.nodes[1].title == "Section 1.1"
    assert result.nodes[3].level == 1


def test_parse_nodes_have_content(sample_md):
    """Body text under each heading lands in that node's content."""
    result = MarkdownParser().parse(sample_md)
    assert "Some intro text" in result.nodes[0].content
    assert "Details here" in result.nodes[1].content


def test_parse_nodes_have_index(sample_md):
    """Every node carries a non-None index."""
    result = MarkdownParser().parse(sample_md)
    for node in result.nodes:
        assert node.index is not None

29
tests/test_pdf_parser.py Normal file
View file

@@ -0,0 +1,29 @@
import pytest
from pathlib import Path
from pageindex.parser.pdf import PdfParser
from pageindex.parser.protocol import ContentNode, ParsedDocument

# Real-PDF tests are skipped when this fixture file is absent.
TEST_PDF = Path("tests/pdfs/deepseek-r1.pdf")


def test_supported_extensions():
    """PdfParser handles .pdf files."""
    assert ".pdf" in PdfParser().supported_extensions()


@pytest.mark.skipif(not TEST_PDF.exists(), reason="Test PDF not available")
def test_parse_returns_parsed_document():
    """Parsing a real PDF yields a named, non-empty ParsedDocument."""
    result = PdfParser().parse(str(TEST_PDF))
    assert isinstance(result, ParsedDocument)
    assert len(result.nodes) > 0
    assert result.doc_name != ""


@pytest.mark.skipif(not TEST_PDF.exists(), reason="Test PDF not available")
def test_parse_nodes_are_flat_without_level():
    """PDF nodes are a flat page list: indexed, token-counted, no levels."""
    result = PdfParser().parse(str(TEST_PDF))
    for node in result.nodes:
        assert isinstance(node, ContentNode)
        assert node.content is not None
        assert node.tokens >= 0
        assert node.index is not None
        assert node.level is None

95
tests/test_pipeline.py Normal file
View file

@@ -0,0 +1,95 @@
# Unit tests for the indexing pipeline: strategy detection, level-based
# tree construction, the content-based fallback, and the null logger.
import asyncio
from unittest.mock import patch, AsyncMock
from pageindex.parser.protocol import ContentNode, ParsedDocument
from pageindex.index.pipeline import (
    detect_strategy, build_tree_from_levels, build_index,
    _content_based_pipeline, _NullLogger,
)


def test_detect_strategy_with_level():
    """Nodes carrying heading levels select the level-based strategy."""
    nodes = [
        ContentNode(content="# Intro", tokens=10, title="Intro", index=1, level=1),
        ContentNode(content="## Details", tokens=10, title="Details", index=5, level=2),
    ]
    assert detect_strategy(nodes) == "level_based"


def test_detect_strategy_without_level():
    """Flat, level-less nodes select the content-based strategy."""
    nodes = [
        ContentNode(content="Page 1 text", tokens=100, index=1),
        ContentNode(content="Page 2 text", tokens=100, index=2),
    ]
    assert detect_strategy(nodes) == "content_based"


def test_build_tree_from_levels():
    """Level-2 nodes nest under the preceding level-1 node."""
    nodes = [
        ContentNode(content="ch1 text", tokens=10, title="Chapter 1", index=1, level=1),
        ContentNode(content="s1.1 text", tokens=10, title="Section 1.1", index=5, level=2),
        ContentNode(content="s1.2 text", tokens=10, title="Section 1.2", index=10, level=2),
        ContentNode(content="ch2 text", tokens=10, title="Chapter 2", index=20, level=1),
    ]
    tree = build_tree_from_levels(nodes)
    assert len(tree) == 2  # 2 root nodes (chapters)
    assert tree[0]["title"] == "Chapter 1"
    assert len(tree[0]["nodes"]) == 2  # 2 sections under chapter 1
    assert tree[0]["nodes"][0]["title"] == "Section 1.1"
    assert tree[0]["nodes"][1]["title"] == "Section 1.2"
    assert tree[1]["title"] == "Chapter 2"
    assert len(tree[1]["nodes"]) == 0


def test_build_tree_from_levels_single_level():
    """All-level-1 input produces a flat list of roots."""
    nodes = [
        ContentNode(content="a", tokens=5, title="A", index=1, level=1),
        ContentNode(content="b", tokens=5, title="B", index=2, level=1),
    ]
    tree = build_tree_from_levels(nodes)
    assert len(tree) == 2
    assert tree[0]["title"] == "A"
    assert tree[1]["title"] == "B"


def test_build_tree_from_levels_deep_nesting():
    """Strictly increasing levels chain into a single deep branch."""
    nodes = [
        ContentNode(content="h1", tokens=5, title="H1", index=1, level=1),
        ContentNode(content="h2", tokens=5, title="H2", index=2, level=2),
        ContentNode(content="h3", tokens=5, title="H3", index=3, level=3),
    ]
    tree = build_tree_from_levels(nodes)
    assert len(tree) == 1
    assert tree[0]["title"] == "H1"
    assert len(tree[0]["nodes"]) == 1
    assert tree[0]["nodes"][0]["title"] == "H2"
    assert len(tree[0]["nodes"][0]["nodes"]) == 1
    assert tree[0]["nodes"][0]["nodes"][0]["title"] == "H3"


def test_content_based_pipeline_does_not_raise():
    """_content_based_pipeline should delegate to tree_parser, not raise NotImplementedError."""
    fake_tree = [{"title": "Intro", "start_index": 1, "end_index": 2, "nodes": []}]

    # Async stand-in for the real tree_parser; signature mirrors the callee.
    async def fake_tree_parser(page_list, opt, doc=None, logger=None):
        return fake_tree

    page_list = [("Page 1 text", 50), ("Page 2 text", 60)]
    from types import SimpleNamespace
    opt = SimpleNamespace(model="test-model")
    # Patch at the use-site module so the pipeline picks up the fake.
    with patch("pageindex.index.page_index.tree_parser", new=fake_tree_parser):
        result = asyncio.run(_content_based_pipeline(page_list, opt))
    assert result == fake_tree


def test_null_logger_methods():
    """NullLogger should have info/error/debug and not raise."""
    logger = _NullLogger()
    logger.info("test message")
    logger.error("test error")
    logger.debug("test debug")
    logger.info({"key": "value"})

View file

@@ -0,0 +1,61 @@
# Unit tests for the SQLite storage engine.
import pytest
from pageindex.storage.sqlite import SQLiteStorage


@pytest.fixture
def storage(tmp_path):
    """A SQLiteStorage backed by a throwaway database file."""
    return SQLiteStorage(str(tmp_path / "test.db"))


def _pdf_doc(name, path, structure=None):
    """Minimal pdf document record shared by the bulk tests."""
    return {
        "doc_name": name,
        "doc_type": "pdf",
        "file_path": path,
        "structure": [] if structure is None else structure,
    }


def test_create_and_list_collections(storage):
    """Created collections show up in the listing."""
    storage.create_collection("papers")
    assert "papers" in storage.list_collections()


def test_get_or_create_collection_idempotent(storage):
    """Calling get_or_create twice must not duplicate the collection."""
    storage.get_or_create_collection("papers")
    storage.get_or_create_collection("papers")
    assert storage.list_collections().count("papers") == 1


def test_delete_collection(storage):
    """Deleted collections disappear from the listing."""
    storage.create_collection("papers")
    storage.delete_collection("papers")
    assert "papers" not in storage.list_collections()


def test_save_and_get_document(storage):
    """Saved document fields round-trip through get_document."""
    storage.create_collection("papers")
    doc = {
        "doc_name": "test.pdf", "doc_description": "A test",
        "file_path": "/tmp/test.pdf", "doc_type": "pdf",
        "structure": [{"title": "Intro", "node_id": "0001"}],
    }
    storage.save_document("papers", "doc-1", doc)
    result = storage.get_document("papers", "doc-1")
    assert result["doc_name"] == "test.pdf"
    assert result["doc_type"] == "pdf"


def test_get_document_structure(storage):
    """get_document_structure returns the stored tree as saved."""
    storage.create_collection("papers")
    structure = [{"title": "Ch1", "node_id": "0001", "nodes": []}]
    storage.save_document("papers", "doc-1", _pdf_doc("test.pdf", "/tmp/test.pdf", structure))
    result = storage.get_document_structure("papers", "doc-1")
    assert result[0]["title"] == "Ch1"


def test_list_documents(storage):
    """Every saved document is returned by list_documents."""
    storage.create_collection("papers")
    storage.save_document("papers", "doc-1", _pdf_doc("p1.pdf", "/tmp/p1.pdf"))
    storage.save_document("papers", "doc-2", _pdf_doc("p2.pdf", "/tmp/p2.pdf"))
    assert len(storage.list_documents("papers")) == 2


def test_delete_document(storage):
    """delete_document removes the record from the collection."""
    storage.create_collection("papers")
    storage.save_document("papers", "doc-1", _pdf_doc("test.pdf", "/tmp/test.pdf"))
    storage.delete_document("papers", "doc-1")
    assert len(storage.list_documents("papers")) == 0


def test_delete_collection_cascades_documents(storage):
    """Dropping a collection also drops its documents."""
    storage.create_collection("papers")
    storage.save_document("papers", "doc-1", _pdf_doc("test.pdf", "/tmp/test.pdf"))
    storage.delete_collection("papers")
    assert "papers" not in storage.list_collections()

View file

@@ -0,0 +1,19 @@
from pageindex.storage.protocol import StorageEngine


def test_storage_engine_is_protocol():
    """A structurally complete implementation passes isinstance(),
    i.e. StorageEngine behaves as a runtime-checkable structural type."""

    class FakeStorage:
        def create_collection(self, name: str) -> None: pass
        def get_or_create_collection(self, name: str) -> None: pass
        def list_collections(self) -> list[str]: return []
        def delete_collection(self, name: str) -> None: pass
        def save_document(self, collection: str, doc_id: str, doc: dict) -> None: pass
        def find_document_by_hash(self, collection: str, file_hash: str) -> str | None: return None
        def get_document(self, collection: str, doc_id: str) -> dict: return {}
        def get_document_structure(self, collection: str, doc_id: str) -> dict: return {}
        def get_pages(self, collection: str, doc_id: str) -> list | None: return None
        def list_documents(self, collection: str) -> list[dict]: return []
        def delete_document(self, collection: str, doc_id: str) -> None: pass
        def close(self) -> None: pass

    fake = FakeStorage()
    assert isinstance(fake, StorageEngine)