feat: add PageIndex SDK with local/cloud dual-mode support (#207)

This commit is contained in:
Kylin 2026-04-06 22:51:04 +08:00 committed by Ray
parent f2dcffc0b7
commit c7fe93bb56
45 changed files with 4225 additions and 274 deletions

29
tests/test_pdf_parser.py Normal file
View file

@ -0,0 +1,29 @@
import pytest
from pathlib import Path
from pageindex.parser.pdf import PdfParser
from pageindex.parser.protocol import ContentNode, ParsedDocument
# Sample PDF fixture stored in the "pdfs" directory next to this test module.
# Anchored to __file__ instead of the current working directory so that the
# skipif guards in this module do not silently skip the parsing tests when
# pytest is launched from somewhere other than the repository root.
TEST_PDF = Path(__file__).resolve().parent / "pdfs" / "deepseek-r1.pdf"
def test_supported_extensions():
    """PdfParser must list ``.pdf`` among its supported extensions."""
    extensions = PdfParser().supported_extensions()
    assert ".pdf" in extensions
@pytest.mark.skipif(not TEST_PDF.exists(), reason="Test PDF not available")
def test_parse_returns_parsed_document():
    """Parsing the sample PDF yields a ``ParsedDocument`` with nodes and a name.

    Skipped when the sample PDF is not present on disk.
    """
    document = PdfParser().parse(str(TEST_PDF))

    # Type first, then payload: at least one node and a doc_name that is not
    # the empty string.
    assert isinstance(document, ParsedDocument)
    assert len(document.nodes) > 0
    assert document.doc_name != ""
@pytest.mark.skipif(not TEST_PDF.exists(), reason="Test PDF not available")
def test_parse_nodes_are_flat_without_level():
    """Each parsed node is a ``ContentNode`` with content, tokens, index and no level.

    Skipped when the sample PDF is not present on disk.
    """
    document = PdfParser().parse(str(TEST_PDF))

    for item in document.nodes:
        assert isinstance(item, ContentNode)
        assert item.content is not None
        assert item.tokens >= 0
        assert item.index is not None
        # The PDF parser is expected to leave the level unset on every node.
        assert item.level is None