mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-30 18:46:21 +02:00
feat: add PageIndex SDK with local/cloud dual-mode support (#207)
This commit is contained in:
parent
f2dcffc0b7
commit
c7fe93bb56
45 changed files with 4225 additions and 274 deletions
0
pageindex/parser/__init__.py
Normal file
0
pageindex/parser/__init__.py
Normal file
59
pageindex/parser/markdown.py
Normal file
59
pageindex/parser/markdown.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
import re
|
||||
from pathlib import Path
|
||||
from .protocol import ContentNode, ParsedDocument
|
||||
from ..index.utils import count_tokens
|
||||
|
||||
|
||||
class MarkdownParser:
|
||||
def supported_extensions(self) -> list[str]:
|
||||
return [".md", ".markdown"]
|
||||
|
||||
def parse(self, file_path: str, **kwargs) -> ParsedDocument:
|
||||
path = Path(file_path)
|
||||
model = kwargs.get("model")
|
||||
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
lines = content.split("\n")
|
||||
headers = self._extract_headers(lines)
|
||||
nodes = self._build_nodes(headers, lines, model)
|
||||
|
||||
return ParsedDocument(doc_name=path.stem, nodes=nodes)
|
||||
|
||||
def _extract_headers(self, lines: list[str]) -> list[dict]:
|
||||
header_pattern = r"^(#{1,6})\s+(.+)$"
|
||||
code_block_pattern = r"^```"
|
||||
headers = []
|
||||
in_code_block = False
|
||||
|
||||
for line_num, line in enumerate(lines, 1):
|
||||
stripped = line.strip()
|
||||
if re.match(code_block_pattern, stripped):
|
||||
in_code_block = not in_code_block
|
||||
continue
|
||||
if not in_code_block and stripped:
|
||||
match = re.match(header_pattern, stripped)
|
||||
if match:
|
||||
headers.append({
|
||||
"title": match.group(2).strip(),
|
||||
"level": len(match.group(1)),
|
||||
"line_num": line_num,
|
||||
})
|
||||
return headers
|
||||
|
||||
def _build_nodes(self, headers: list[dict], lines: list[str], model: str | None) -> list[ContentNode]:
|
||||
nodes = []
|
||||
for i, header in enumerate(headers):
|
||||
start = header["line_num"] - 1
|
||||
end = headers[i + 1]["line_num"] - 1 if i + 1 < len(headers) else len(lines)
|
||||
text = "\n".join(lines[start:end]).strip()
|
||||
tokens = count_tokens(text, model=model)
|
||||
nodes.append(ContentNode(
|
||||
content=text,
|
||||
tokens=tokens,
|
||||
title=header["title"],
|
||||
index=header["line_num"],
|
||||
level=header["level"],
|
||||
))
|
||||
return nodes
|
||||
101
pageindex/parser/pdf.py
Normal file
101
pageindex/parser/pdf.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
import pymupdf
|
||||
from pathlib import Path
|
||||
from .protocol import ContentNode, ParsedDocument
|
||||
from ..index.utils import count_tokens
|
||||
|
||||
# Minimum width/height (in px, per get_text("dict") block metadata) for an
# extracted image to be kept; smaller blocks are skipped as icons/artifacts.
_MIN_IMAGE_SIZE = 32
|
||||
|
||||
|
||||
class PdfParser:
|
||||
def supported_extensions(self) -> list[str]:
|
||||
return [".pdf"]
|
||||
|
||||
def parse(self, file_path: str, **kwargs) -> ParsedDocument:
|
||||
path = Path(file_path)
|
||||
model = kwargs.get("model")
|
||||
images_dir = kwargs.get("images_dir")
|
||||
nodes = []
|
||||
|
||||
with pymupdf.open(str(path)) as doc:
|
||||
for i, page in enumerate(doc):
|
||||
page_num = i + 1
|
||||
if images_dir:
|
||||
content, images = self._extract_page_with_images(
|
||||
doc, page, page_num, images_dir)
|
||||
else:
|
||||
content = page.get_text()
|
||||
images = None
|
||||
|
||||
tokens = count_tokens(content, model=model)
|
||||
nodes.append(ContentNode(
|
||||
content=content or "",
|
||||
tokens=tokens,
|
||||
index=page_num,
|
||||
images=images if images else None,
|
||||
))
|
||||
|
||||
return ParsedDocument(doc_name=path.stem, nodes=nodes)
|
||||
|
||||
@staticmethod
|
||||
def _extract_page_with_images(doc, page, page_num: int,
|
||||
images_dir: str) -> tuple[str, list[dict]]:
|
||||
"""Extract text and images from a page, preserving their relative order.
|
||||
|
||||
Uses get_text("dict") to iterate blocks in reading order.
|
||||
Text blocks become text; image blocks are saved to disk and replaced
|
||||
with an inline placeholder: 
|
||||
"""
|
||||
images_path = Path(images_dir)
|
||||
images_path.mkdir(parents=True, exist_ok=True)
|
||||
# Use path relative to cwd so downstream consumers can access directly
|
||||
try:
|
||||
rel_images_path = images_path.relative_to(Path.cwd())
|
||||
except ValueError:
|
||||
rel_images_path = images_path
|
||||
|
||||
parts: list[str] = []
|
||||
images: list[dict] = []
|
||||
img_idx = 0
|
||||
|
||||
for block in page.get_text("dict")["blocks"]:
|
||||
if block["type"] == 0: # text block
|
||||
lines = []
|
||||
for line in block["lines"]:
|
||||
spans_text = "".join(span["text"] for span in line["spans"])
|
||||
lines.append(spans_text)
|
||||
parts.append("\n".join(lines))
|
||||
|
||||
elif block["type"] == 1: # image block
|
||||
width = block.get("width", 0)
|
||||
height = block.get("height", 0)
|
||||
if width < _MIN_IMAGE_SIZE or height < _MIN_IMAGE_SIZE:
|
||||
continue
|
||||
|
||||
image_bytes = block.get("image")
|
||||
ext = block.get("ext", "png")
|
||||
if not image_bytes:
|
||||
continue
|
||||
|
||||
try:
|
||||
pix = pymupdf.Pixmap(image_bytes)
|
||||
if pix.n > 4:
|
||||
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
|
||||
filename = f"p{page_num}_img{img_idx}.png"
|
||||
save_path = images_path / filename
|
||||
pix.save(str(save_path))
|
||||
pix = None
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
rel_path = str(rel_images_path / filename)
|
||||
images.append({
|
||||
"path": rel_path,
|
||||
"width": width,
|
||||
"height": height,
|
||||
})
|
||||
parts.append(f"")
|
||||
img_idx += 1
|
||||
|
||||
content = "\n".join(parts)
|
||||
return content, images
|
||||
28
pageindex/parser/protocol.py
Normal file
28
pageindex/parser/protocol.py
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from typing import Protocol, runtime_checkable
|
||||
|
||||
|
||||
@dataclass
class ContentNode:
    """Universal content unit produced by parsers."""
    # Raw text of this unit (e.g. a PDF page or a markdown header section).
    content: str
    # Token count of `content`, computed by the parser via count_tokens.
    tokens: int
    # Optional section title (e.g. markdown header text); None for page-based parsers.
    title: str | None = None
    # 1-based source position: page number or header line number, parser-dependent.
    index: int | None = None
    # Header nesting depth where applicable (markdown '#' count).
    level: int | None = None
    images: list[dict] | None = None  # [{"path": str, "width": int, "height": int}, ...]
|
||||
|
||||
|
||||
@dataclass
class ParsedDocument:
    """Unified parser output. Always a flat list of ContentNode."""
    # Document name, typically the source file's stem (no extension).
    doc_name: str
    # Flat, ordered list of parsed content units.
    nodes: list[ContentNode]
    # Optional parser-specific extras; None when a parser has nothing to add.
    metadata: dict | None = None
|
||||
|
||||
|
||||
@runtime_checkable
class DocumentParser(Protocol):
    """Structural (duck-typed) interface every document parser implements.

    runtime_checkable allows isinstance() checks against this Protocol.
    """
    # File extensions (with leading dot) the parser accepts.
    def supported_extensions(self) -> list[str]: ...
    # Parse *file_path* into a ParsedDocument; extra options go via **kwargs.
    def parse(self, file_path: str, **kwargs) -> ParsedDocument: ...
|
||||
Loading…
Add table
Add a link
Reference in a new issue