mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-25 08:06:22 +02:00
feat: add PageIndex SDK with local/cloud dual-mode support (#207)
This commit is contained in:
parent
f2dcffc0b7
commit
c7fe93bb56
45 changed files with 4225 additions and 274 deletions
29
tests/test_pdf_parser.py
Normal file
29
tests/test_pdf_parser.py
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
import pytest
|
||||
from pathlib import Path
|
||||
from pageindex.parser.pdf import PdfParser
|
||||
from pageindex.parser.protocol import ContentNode, ParsedDocument
|
||||
|
||||
# Sample PDF used by the parse tests below. The path is anchored to this test
# module's own directory rather than the current working directory, so the
# ``skipif`` guards see the fixture no matter where pytest is launched from.
TEST_PDF = Path(__file__).resolve().parent / "pdfs" / "deepseek-r1.pdf"
|
||||
|
||||
def test_supported_extensions():
    """A fresh ``PdfParser`` must list ``.pdf`` among its supported extensions."""
    extensions = PdfParser().supported_extensions()
    assert ".pdf" in extensions
|
||||
|
||||
@pytest.mark.skipif(not TEST_PDF.exists(), reason="Test PDF not available")
def test_parse_returns_parsed_document():
    """Parsing the sample PDF yields a non-empty, named ``ParsedDocument``.

    Skipped when the sample PDF is not present on disk.
    """
    document = PdfParser().parse(str(TEST_PDF))

    # The parser must return the protocol's document type ...
    assert isinstance(document, ParsedDocument)
    # ... containing at least one node ...
    assert len(document.nodes) > 0
    # ... and carrying a non-empty document name.
    assert document.doc_name != ""
|
||||
|
||||
@pytest.mark.skipif(not TEST_PDF.exists(), reason="Test PDF not available")
def test_parse_nodes_are_flat_without_level():
    """Every node parsed from the sample PDF is a level-less ``ContentNode``.

    Each node must have content, a non-negative token count and an index,
    while its ``level`` must be ``None``. Skipped when the sample PDF is
    not present on disk.
    """
    parsed = PdfParser().parse(str(TEST_PDF))

    for entry in parsed.nodes:
        assert isinstance(entry, ContentNode)
        assert entry.content is not None
        assert entry.tokens >= 0
        assert entry.index is not None
        # PDF nodes are expected to carry no hierarchy level.
        assert entry.level is None
|
||||
Loading…
Add table
Add a link
Reference in a new issue