PageIndex/pageindex/parser/pdf.py

101 lines
3.6 KiB
Python

import logging
from pathlib import Path

import pymupdf

from .protocol import ContentNode, ParsedDocument
from ..index.utils import count_tokens
# Minimum image dimension to keep (skip icons/artifacts).
# An image block is skipped when its reported width OR height is below this
# value. Presumably measured in pixels (PyMuPDF image-block "width"/"height")
# -- TODO confirm.
_MIN_IMAGE_SIZE = 32
class PdfParser:
    """Parse a PDF into a ``ParsedDocument`` with one ``ContentNode`` per page."""

    def supported_extensions(self) -> list[str]:
        """Return the file extensions this parser handles."""
        return [".pdf"]

    def parse(self, file_path: str, **kwargs) -> ParsedDocument:
        """Parse the PDF at *file_path* page by page.

        Args:
            file_path: Path of the PDF file to open.
            **kwargs: Optional settings read by this parser:
                ``model`` is forwarded to ``count_tokens``;
                ``images_dir``, when truthy, enables image extraction and
                names the directory the images are written to.

        Returns:
            A ``ParsedDocument`` whose ``doc_name`` is the file stem and whose
            nodes are one ``ContentNode`` per page, with a 1-based ``index``.
        """
        path = Path(file_path)
        model = kwargs.get("model")
        images_dir = kwargs.get("images_dir")
        nodes = []
        with pymupdf.open(str(path)) as doc:
            for page_num, page in enumerate(doc, start=1):
                if images_dir:
                    content, images = self._extract_page_with_images(
                        doc, page, page_num, images_dir)
                else:
                    content, images = page.get_text(), None
                # Normalise before counting so the token count always refers
                # to the exact string stored on the node.
                content = content or ""
                nodes.append(ContentNode(
                    content=content,
                    tokens=count_tokens(content, model=model),
                    index=page_num,
                    # An empty image list is stored as None.
                    images=images or None,
                ))
        return ParsedDocument(doc_name=path.stem, nodes=nodes)

    @staticmethod
    def _extract_page_with_images(doc, page, page_num: int,
                                  images_dir: str) -> tuple[str, list[dict]]:
        """Extract text and images from a page, preserving their relative order.

        Iterates the blocks of ``page.get_text("dict")`` in the order PyMuPDF
        returns them. Text blocks become text; image blocks are saved to disk
        as PNG and replaced with an inline placeholder: ![image](path)

        Args:
            doc: The open document the page belongs to (currently unused).
            page: The page to extract from.
            page_num: 1-based page number, used in the image file names.
            images_dir: Directory the images are written to; created if
                missing.

        Returns:
            A ``(content, images)`` tuple. ``content`` is the block texts and
            image placeholders joined by newlines. ``images`` has one dict per
            saved image with keys ``"path"``, ``"width"`` and ``"height"``.
        """
        images_path = Path(images_dir)
        images_path.mkdir(parents=True, exist_ok=True)
        # Use path relative to cwd so downstream consumers can access directly.
        # relative_to() raises ValueError when images_path is not located
        # under the cwd (or is already relative); keep it unchanged then.
        try:
            rel_images_path = images_path.relative_to(Path.cwd())
        except ValueError:
            rel_images_path = images_path

        parts: list[str] = []
        images: list[dict] = []
        img_idx = 0
        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # text block
                # Spans of a line are concatenated; lines are newline-joined.
                parts.append("\n".join(
                    "".join(span["text"] for span in line["spans"])
                    for line in block["lines"]
                ))
            elif block["type"] == 1:  # image block
                width = block.get("width", 0)
                height = block.get("height", 0)
                if width < _MIN_IMAGE_SIZE or height < _MIN_IMAGE_SIZE:
                    continue
                image_bytes = block.get("image")
                if not image_bytes:
                    continue

                # img_idx only advances on success, so file names stay
                # contiguous even when an image is skipped.
                filename = f"p{page_num}_img{img_idx}.png"
                try:
                    pix = pymupdf.Pixmap(image_bytes)
                    # PNG output supports only gray/RGB. Count colour
                    # components without the alpha channel so that plain CMYK
                    # (n == 4, alpha == 0) is converted too, not just
                    # CMYK+alpha.
                    if pix.n - pix.alpha >= 4:
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                    pix.save(str(images_path / filename))
                    pix = None  # drop the reference to the pixmap promptly
                except Exception:
                    # Best effort: one bad image must not abort the page, but
                    # leave a trace instead of dropping it silently.
                    logging.getLogger(__name__).warning(
                        "Skipping image %d on page %d: could not decode or save",
                        img_idx, page_num, exc_info=True)
                    continue

                rel_path = str(rel_images_path / filename)
                images.append({
                    "path": rel_path,
                    "width": width,
                    "height": height,
                })
                parts.append(f"![image]({rel_path})")
                img_idx += 1

        content = "\n".join(parts)
        return content, images