mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-27 00:56:21 +02:00
feat: add PageIndex SDK with local/cloud dual-mode support (#207)
This commit is contained in:
parent
f2dcffc0b7
commit
c7fe93bb56
45 changed files with 4225 additions and 274 deletions
28
pageindex/parser/protocol.py
Normal file
28
pageindex/parser/protocol.py
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from typing import Protocol, runtime_checkable
|
||||
|
||||
|
||||
@dataclass
class ContentNode:
    """Universal content unit produced by parsers.

    Only ``content`` and ``tokens`` are required; every other field is
    optional and defaults to ``None``. Field order is part of the public
    constructor signature (positional arguments), so it must not change.
    """

    content: str
    tokens: int  # presumably the token count of ``content`` -- confirm with parsers
    title: str | None = None
    index: int | None = None  # NOTE(review): looks like a positional index -- confirm
    level: int | None = None  # NOTE(review): looks like a heading/nesting depth -- confirm
    images: list[dict] | None = None  # [{"path": str, "width": int, "height": int}, ...]
|
||||
|
||||
|
||||
@dataclass
class ParsedDocument:
    """Unified parser output. Always a flat list of ContentNode.

    ``doc_name`` and ``nodes`` are required; ``metadata`` is optional and
    defaults to ``None``.
    """

    doc_name: str
    nodes: list[ContentNode]  # flat sequence; no nesting between nodes
    metadata: dict | None = None  # free-form; schema is not fixed by this class
|
||||
|
||||
|
||||
@runtime_checkable
class DocumentParser(Protocol):
    """Structural interface for document parsers.

    Any object providing both methods below satisfies this protocol; no
    inheritance is required. Because the protocol is ``runtime_checkable``,
    ``isinstance(obj, DocumentParser)`` works, but it only checks that the
    methods exist -- not their signatures or return types.
    """

    def supported_extensions(self) -> list[str]:
        """Return the file extensions this parser can handle."""
        ...

    def parse(self, file_path: str, **kwargs) -> ParsedDocument:
        """Parse the file at *file_path* and return a ``ParsedDocument``.

        Extra keyword arguments are parser-specific options.
        """
        ...
|
||||
Loading…
Add table
Add a link
Reference in a new issue