mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-24 23:56:21 +02:00
95 lines
3.4 KiB
Python
95 lines
3.4 KiB
Python
# tests/sdk/test_pipeline.py
|
|
import asyncio
|
|
from unittest.mock import patch, AsyncMock
|
|
|
|
from pageindex.parser.protocol import ContentNode, ParsedDocument
|
|
from pageindex.index.pipeline import (
|
|
detect_strategy, build_tree_from_levels, build_index,
|
|
_content_based_pipeline, _NullLogger,
|
|
)
|
|
|
|
|
|
def test_detect_strategy_with_level():
    """Expect ``"level_based"`` when every node is built with an explicit ``level``."""
    leveled_nodes = [
        ContentNode(content="# Intro", tokens=10, title="Intro", index=1, level=1),
        ContentNode(content="## Details", tokens=10, title="Details", index=5, level=2),
    ]

    chosen = detect_strategy(leveled_nodes)

    assert chosen == "level_based"
|
|
|
|
|
|
def test_detect_strategy_without_level():
    """Expect ``"content_based"`` when no node is constructed with a ``level``."""
    # (content, index) for each page-like node; none of them is given a level.
    page_specs = (
        ("Page 1 text", 1),
        ("Page 2 text", 2),
    )
    unleveled_nodes = [
        ContentNode(content=text, tokens=100, index=position)
        for text, position in page_specs
    ]

    chosen = detect_strategy(unleveled_nodes)

    assert chosen == "content_based"
|
|
|
|
|
|
def test_build_tree_from_levels():
    """Check nesting for two level-1 chapters where the first owns two level-2 sections."""
    # (content, title, index, level) in document order.
    outline = (
        ("ch1 text", "Chapter 1", 1, 1),
        ("s1.1 text", "Section 1.1", 5, 2),
        ("s1.2 text", "Section 1.2", 10, 2),
        ("ch2 text", "Chapter 2", 20, 1),
    )
    flat_nodes = [
        ContentNode(content=text, tokens=10, title=heading, index=position, level=depth)
        for text, heading, position, depth in outline
    ]

    tree = build_tree_from_levels(flat_nodes)

    # Both chapters are expected at the root.
    assert len(tree) == 2

    first_chapter = tree[0]
    assert first_chapter["title"] == "Chapter 1"

    # The two sections are expected under the first chapter, in input order.
    sections = first_chapter["nodes"]
    assert len(sections) == 2
    assert sections[0]["title"] == "Section 1.1"
    assert sections[1]["title"] == "Section 1.2"

    second_chapter = tree[1]
    assert second_chapter["title"] == "Chapter 2"
    assert len(second_chapter["nodes"]) == 0
|
|
|
|
|
|
def test_build_tree_from_levels_single_level():
    """Two nodes at the same level should yield two roots, titled in input order."""
    siblings = [
        ContentNode(content="a", tokens=5, title="A", index=1, level=1),
        ContentNode(content="b", tokens=5, title="B", index=2, level=1),
    ]

    roots = build_tree_from_levels(siblings)

    assert len(roots) == 2
    for position, expected_title in enumerate(("A", "B")):
        assert roots[position]["title"] == expected_title
|
|
|
|
|
|
def test_build_tree_from_levels_deep_nesting():
    """Levels 1 -> 2 -> 3 in sequence should produce a single three-deep chain."""
    # (content, title, index, level): each node is one level deeper than the last.
    chain_specs = (
        ("h1", "H1", 1, 1),
        ("h2", "H2", 2, 2),
        ("h3", "H3", 3, 3),
    )
    chain = [
        ContentNode(content=text, tokens=5, title=heading, index=position, level=depth)
        for text, heading, position, depth in chain_specs
    ]

    top_level = build_tree_from_levels(chain)

    # Walk down one level at a time; each level must hold exactly one node.
    assert len(top_level) == 1
    assert top_level[0]["title"] == "H1"

    second_level = top_level[0]["nodes"]
    assert len(second_level) == 1
    assert second_level[0]["title"] == "H2"

    third_level = second_level[0]["nodes"]
    assert len(third_level) == 1
    assert third_level[0]["title"] == "H3"
|
|
|
|
|
|
def test_content_based_pipeline_does_not_raise():
    """With ``tree_parser`` patched to an async stub, the pipeline returns the stub's tree."""
    from types import SimpleNamespace

    expected_tree = [{"title": "Intro", "start_index": 1, "end_index": 2, "nodes": []}]

    # Parameter names are kept as-is: the patched call site may pass them by keyword.
    async def stub_tree_parser(page_list, opt, doc=None, logger=None):
        return expected_tree

    pages = [("Page 1 text", 50), ("Page 2 text", 60)]
    options = SimpleNamespace(model="test-model")
    patch_target = "pageindex.index.page_index.tree_parser"

    with patch(patch_target, new=stub_tree_parser):
        outcome = asyncio.run(_content_based_pipeline(pages, options))

    assert outcome == expected_tree
|
|
|
|
|
|
def test_null_logger_methods():
    """Smoke test: ``info``/``error``/``debug`` on ``_NullLogger`` are callable without raising."""
    sink = _NullLogger()

    # (method name, payload) pairs, invoked in order; the last one passes a dict.
    invocations = (
        ("info", "test message"),
        ("error", "test error"),
        ("debug", "test debug"),
        ("info", {"key": "value"}),
    )
    for method_name, payload in invocations:
        getattr(sink, method_name)(payload)
|