# PageIndex/pageindex/parser/pdf.py

import logging
from pathlib import Path

import pymupdf

from .protocol import ContentNode, ParsedDocument
from ..index.utils import count_tokens

# Minimum image dimension to keep: an image block whose width OR height is
# below this value is skipped (filters out icons/artifacts).
_MIN_IMAGE_SIZE = 32


class PdfParser:
    """Parse PDF files into one ``ContentNode`` per page using PyMuPDF."""

    def supported_extensions(self) -> list[str]:
        """Return the file extensions this parser handles."""
        return [".pdf"]

    def parse(self, file_path: str, **kwargs) -> ParsedDocument:
        """Parse the PDF at *file_path* into a ``ParsedDocument``.

        Each page becomes one ``ContentNode`` whose ``index`` is the 1-based
        page number.

        Keyword Args:
            model: Forwarded to ``count_tokens`` as its ``model`` argument.
            images_dir: When truthy, images found on each page are written
                under this directory and referenced inline in the page
                content. When missing or falsy, only plain text is extracted
                and every node's ``images`` is ``None``.

        Returns:
            A ``ParsedDocument`` named after the file's stem.
        """
        path = Path(file_path)
        model = kwargs.get("model")
        images_dir = kwargs.get("images_dir")
        nodes = []

        with pymupdf.open(str(path)) as doc:
            for page_num, page in enumerate(doc, start=1):
                if images_dir:
                    content, images = self._extract_page_with_images(
                        doc, page, page_num, images_dir)
                else:
                    content, images = page.get_text(), None

                tokens = count_tokens(content, model=model)
                nodes.append(ContentNode(
                    content=content or "",
                    tokens=tokens,
                    index=page_num,
                    # Normalize "no images" (empty list) to None.
                    images=images or None,
                ))

        return ParsedDocument(doc_name=path.stem, nodes=nodes)

    @staticmethod
    def _extract_page_with_images(doc, page, page_num: int,
                                  images_dir: str) -> tuple[str, list[dict]]:
        """Extract text and images from a page, preserving their relative order.

        Iterates the blocks of ``page.get_text("dict")`` in the order they are
        returned. Text blocks (type 0) contribute their text; image blocks
        (type 1) are saved to disk as PNG and replaced with an inline
        placeholder: ![image](path)

        Image blocks smaller than ``_MIN_IMAGE_SIZE`` in either dimension, or
        without image data, are skipped. An image that cannot be decoded or
        written is logged and skipped so one bad image does not abort the page.

        Args:
            doc: Unused; kept so the call signature stays unchanged.
            page: Page object providing ``get_text("dict")``.
            page_num: 1-based page number, used in saved image file names.
            images_dir: Directory to write images into (created if missing).

        Returns:
            A ``(content, images)`` tuple: the page content with image
            placeholders, and one dict per saved image with the keys
            ``path``, ``width`` and ``height``.
        """
        images_path = Path(images_dir)
        images_path.mkdir(parents=True, exist_ok=True)
        # Use path relative to cwd so downstream consumers can access directly
        try:
            rel_images_path = images_path.relative_to(Path.cwd())
        except ValueError:
            # Not located under cwd (or already relative): use it as given.
            rel_images_path = images_path

        parts: list[str] = []
        images: list[dict] = []
        img_idx = 0  # counts only successfully saved images on this page

        for block in page.get_text("dict")["blocks"]:
            block_type = block["type"]

            if block_type == 0:  # text block
                # Spans of a line are concatenated; lines are newline-joined.
                parts.append("\n".join(
                    "".join(span["text"] for span in line["spans"])
                    for line in block["lines"]
                ))

            elif block_type == 1:  # image block
                width = block.get("width", 0)
                height = block.get("height", 0)
                if width < _MIN_IMAGE_SIZE or height < _MIN_IMAGE_SIZE:
                    continue

                image_bytes = block.get("image")
                if not image_bytes:
                    continue

                filename = f"p{page_num}_img{img_idx}.png"
                save_path = images_path / filename
                try:
                    pix = pymupdf.Pixmap(image_bytes)
                    # PNG cannot store CMYK. Subtract the alpha channel so
                    # that CMYK without alpha (n == 4) is converted as well,
                    # while RGBA (n == 4, alpha == 1) is left untouched.
                    if pix.n - pix.alpha > 3:
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                    pix.save(str(save_path))
                except Exception:
                    # Best effort: report the failure and keep going.
                    logging.getLogger(__name__).warning(
                        "Skipping image %s on page %d: could not decode or "
                        "save it", filename, page_num, exc_info=True)
                    continue
                finally:
                    # Drop the pixmap reference on success and failure alike.
                    pix = None

                rel_path = str(rel_images_path / filename)
                images.append({
                    "path": rel_path,
                    "width": width,
                    "height": height,
                })
                parts.append(f"![image]({rel_path})")
                img_idx += 1

        content = "\n".join(parts)
        return content, images