feat: add PageIndex SDK with local/cloud dual-mode support (#207)

This commit is contained in:
Kylin 2026-04-06 22:51:04 +08:00 committed by Ray
parent f2dcffc0b7
commit c7fe93bb56
45 changed files with 4225 additions and 274 deletions

View file

View file

@ -0,0 +1,59 @@
import re
from pathlib import Path
from .protocol import ContentNode, ParsedDocument
from ..index.utils import count_tokens
class MarkdownParser:
    """Parse Markdown files into header-delimited ContentNode sections.

    ATX headers (``#`` through ``######``) found outside fenced code blocks
    become section boundaries; each node spans from its header line up to
    (but not including) the next header line.
    """

    def supported_extensions(self) -> list[str]:
        """Return the file extensions this parser handles."""
        return [".md", ".markdown"]

    def parse(self, file_path: str, **kwargs) -> ParsedDocument:
        """Parse *file_path* into a ParsedDocument of header sections.

        Keyword args:
            model: optional model name forwarded to ``count_tokens``.
        """
        path = Path(file_path)
        model = kwargs.get("model")
        content = path.read_text(encoding="utf-8")
        lines = content.split("\n")
        headers = self._extract_headers(lines)
        nodes = self._build_nodes(headers, lines, model)
        return ParsedDocument(doc_name=path.stem, nodes=nodes)

    def _extract_headers(self, lines: list[str]) -> list[dict]:
        """Locate ATX headers outside fenced code blocks.

        Returns a list of ``{"title", "level", "line_num"}`` dicts with
        1-based line numbers, in document order.
        """
        header_re = re.compile(r"^(#{1,6})\s+(.+)$")
        fence_re = re.compile(r"^```")
        headers: list[dict] = []
        in_code_block = False
        for line_num, line in enumerate(lines, 1):
            stripped = line.strip()
            # Toggle fence state on ``` lines so that "# ..." inside a code
            # block is not mistaken for a header.
            if fence_re.match(stripped):
                in_code_block = not in_code_block
                continue
            if in_code_block or not stripped:
                continue
            match = header_re.match(stripped)
            if match:
                headers.append({
                    "title": match.group(2).strip(),
                    "level": len(match.group(1)),
                    "line_num": line_num,
                })
        return headers

    def _build_nodes(self, headers: list[dict], lines: list[str], model: str | None) -> list[ContentNode]:
        """Build one ContentNode per header section.

        Fix: a document with no headers previously produced zero nodes,
        silently dropping all of its content; it now yields a single node
        covering the whole document.

        NOTE(review): text appearing before the first header is still not
        included in any node — confirm whether that is intended.
        """
        if not headers:
            text = "\n".join(lines).strip()
            if not text:
                return []
            return [ContentNode(
                content=text,
                tokens=count_tokens(text, model=model),
                index=1,
            )]
        nodes: list[ContentNode] = []
        for i, header in enumerate(headers):
            start = header["line_num"] - 1
            # Section ends right before the next header, or at EOF.
            end = headers[i + 1]["line_num"] - 1 if i + 1 < len(headers) else len(lines)
            text = "\n".join(lines[start:end]).strip()
            nodes.append(ContentNode(
                content=text,
                tokens=count_tokens(text, model=model),
                title=header["title"],
                index=header["line_num"],
                level=header["level"],
            ))
        return nodes

101
pageindex/parser/pdf.py Normal file
View file

@ -0,0 +1,101 @@
import pymupdf
from pathlib import Path
from .protocol import ContentNode, ParsedDocument
from ..index.utils import count_tokens
# Minimum width/height (in pixels) for an extracted image to be kept;
# anything smaller is assumed to be an icon or rendering artifact.
_MIN_IMAGE_SIZE = 32


class PdfParser:
    """Parse PDF files into one ContentNode per page using PyMuPDF."""

    def supported_extensions(self) -> list[str]:
        """Return the file extensions this parser handles."""
        return [".pdf"]

    def parse(self, file_path: str, **kwargs) -> ParsedDocument:
        """Parse *file_path* into a ParsedDocument with one node per page.

        Keyword args:
            model: optional model name forwarded to ``count_tokens``.
            images_dir: when set, page images are saved under this directory
                and referenced inline as markdown placeholders.
        """
        path = Path(file_path)
        model = kwargs.get("model")
        images_dir = kwargs.get("images_dir")
        nodes = []
        with pymupdf.open(str(path)) as doc:
            for i, page in enumerate(doc):
                page_num = i + 1  # pages are 1-based for callers
                if images_dir:
                    content, images = self._extract_page_with_images(
                        doc, page, page_num, images_dir)
                else:
                    content = page.get_text()
                    images = None
                tokens = count_tokens(content, model=model)
                nodes.append(ContentNode(
                    content=content or "",
                    tokens=tokens,
                    index=page_num,
                    images=images if images else None,
                ))
        return ParsedDocument(doc_name=path.stem, nodes=nodes)

    @staticmethod
    def _extract_page_with_images(doc, page, page_num: int,
                                  images_dir: str) -> tuple[str, list[dict]]:
        """Extract text and images from a page, preserving their relative order.

        Uses get_text("dict") to iterate blocks in reading order.
        Text blocks become text; image blocks are saved to disk (always as
        PNG, regardless of their embedded format) and replaced with an
        inline placeholder: ![image](path)
        """
        images_path = Path(images_dir)
        images_path.mkdir(parents=True, exist_ok=True)
        # Use a path relative to cwd so downstream consumers can access it
        # directly; fall back to the absolute path when outside the cwd.
        try:
            rel_images_path = images_path.relative_to(Path.cwd())
        except ValueError:
            rel_images_path = images_path
        parts: list[str] = []
        images: list[dict] = []
        img_idx = 0
        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # text block
                block_lines = [
                    "".join(span["text"] for span in line["spans"])
                    for line in block["lines"]
                ]
                parts.append("\n".join(block_lines))
            elif block["type"] == 1:  # image block
                width = block.get("width", 0)
                height = block.get("height", 0)
                # Skip icon-sized artifacts and blocks with no embedded bytes.
                if width < _MIN_IMAGE_SIZE or height < _MIN_IMAGE_SIZE:
                    continue
                image_bytes = block.get("image")
                if not image_bytes:
                    continue
                try:
                    pix = pymupdf.Pixmap(image_bytes)
                    # Colorspaces with more than 4 components (e.g. CMYK+alpha)
                    # cannot be written as PNG directly; convert to RGB first.
                    if pix.n > 4:
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                    filename = f"p{page_num}_img{img_idx}.png"
                    save_path = images_path / filename
                    pix.save(str(save_path))
                    pix = None  # release the pixmap buffer promptly
                except Exception:
                    # Best-effort: one undecodable image must not abort
                    # parsing of the whole page.
                    continue
                rel_path = str(rel_images_path / filename)
                images.append({
                    "path": rel_path,
                    "width": width,
                    "height": height,
                })
                parts.append(f"![image]({rel_path})")
                img_idx += 1
        return "\n".join(parts), images

View file

@ -0,0 +1,28 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Protocol, runtime_checkable
@dataclass
class ContentNode:
    """Universal content unit produced by parsers.

    A flat, parser-agnostic section of a document — e.g. one Markdown
    header section or one PDF page.
    """
    content: str  # raw text of this unit
    tokens: int  # token count of `content` (model-dependent)
    title: str | None = None  # section title, e.g. Markdown header text; None for PDF pages
    index: int | None = None  # source position: header line number (Markdown) or page number (PDF)
    level: int | None = None  # header depth 1-6 for Markdown; None otherwise
    images: list[dict] | None = None  # [{"path": str, "width": int, "height": int}, ...]
@dataclass
class ParsedDocument:
    """Unified parser output. Always a flat list of ContentNode."""
    doc_name: str  # document name (source file stem, no extension)
    nodes: list[ContentNode]  # content units in document order
    metadata: dict | None = None  # optional parser-specific extras
@runtime_checkable
class DocumentParser(Protocol):
    """Structural (duck-typed) interface every document parser implements."""

    def supported_extensions(self) -> list[str]:
        """Return the file extensions (with leading dot) this parser handles."""
        ...

    def parse(self, file_path: str, **kwargs) -> ParsedDocument:
        """Parse *file_path* into a ParsedDocument."""
        ...