# Mirror of https://github.com/VectifyAI/PageIndex.git
# (synced 2026-04-24 23:56:21 +02:00; 28 lines, 770 B, Python)
from __future__ import annotations
|
|
from dataclasses import dataclass
|
|
from typing import Protocol, runtime_checkable
|
|
|
|
|
|
@dataclass
class ContentNode:
    """Universal content unit produced by parsers.

    Only ``content`` and ``tokens`` are required; every other field is
    optional and defaults to ``None``.
    """

    # Text carried by this node.
    content: str
    # NOTE(review): presumably the token count of ``content`` -- confirm
    # against the parsers that populate it.
    tokens: int
    # Optional heading/title associated with the node.
    title: str | None = None
    # NOTE(review): presumably the node's position in the document -- confirm.
    index: int | None = None
    # NOTE(review): presumably the heading/nesting depth -- confirm.
    level: int | None = None
    # Optional image descriptors, shaped like:
    # [{"path": str, "width": int, "height": int}, ...]
    images: list[dict] | None = None
|
|
|
|
|
|
@dataclass
class ParsedDocument:
    """Unified parser output. Always a flat list of ContentNode.

    ``doc_name`` and ``nodes`` are required; ``metadata`` is optional and
    defaults to ``None``.
    """

    # Name identifying the parsed document.
    doc_name: str
    # Flat (non-nested) sequence of content nodes.
    nodes: list[ContentNode]
    # Optional free-form mapping of extra information; no schema is
    # defined here.
    metadata: dict | None = None
|
|
|
|
|
|
@runtime_checkable
class DocumentParser(Protocol):
    """Structural interface that document parsers implement.

    The protocol is ``runtime_checkable``, so ``isinstance(obj,
    DocumentParser)`` is allowed. That check only verifies that both
    methods are present on the object, not that their signatures match.
    """

    def supported_extensions(self) -> list[str]:
        """Return the file extensions this parser supports."""
        ...

    def parse(self, file_path: str, **kwargs) -> ParsedDocument:
        """Parse the file at *file_path* and return a ``ParsedDocument``.

        Extra keyword arguments are accepted; their meaning is left to
        the implementing parser.
        """
        ...
|