mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-30 18:46:21 +02:00
feat: add PageIndex SDK with local/cloud dual-mode support (#207)
This commit is contained in:
parent
f2dcffc0b7
commit
c7fe93bb56
45 changed files with 4225 additions and 274 deletions
0
pageindex/parser/__init__.py
Normal file
0
pageindex/parser/__init__.py
Normal file
59
pageindex/parser/markdown.py
Normal file
59
pageindex/parser/markdown.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
import re
|
||||
from pathlib import Path
|
||||
from .protocol import ContentNode, ParsedDocument
|
||||
from ..index.utils import count_tokens
|
||||
|
||||
|
||||
class MarkdownParser:
|
||||
def supported_extensions(self) -> list[str]:
|
||||
return [".md", ".markdown"]
|
||||
|
||||
def parse(self, file_path: str, **kwargs) -> ParsedDocument:
|
||||
path = Path(file_path)
|
||||
model = kwargs.get("model")
|
||||
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
lines = content.split("\n")
|
||||
headers = self._extract_headers(lines)
|
||||
nodes = self._build_nodes(headers, lines, model)
|
||||
|
||||
return ParsedDocument(doc_name=path.stem, nodes=nodes)
|
||||
|
||||
def _extract_headers(self, lines: list[str]) -> list[dict]:
|
||||
header_pattern = r"^(#{1,6})\s+(.+)$"
|
||||
code_block_pattern = r"^```"
|
||||
headers = []
|
||||
in_code_block = False
|
||||
|
||||
for line_num, line in enumerate(lines, 1):
|
||||
stripped = line.strip()
|
||||
if re.match(code_block_pattern, stripped):
|
||||
in_code_block = not in_code_block
|
||||
continue
|
||||
if not in_code_block and stripped:
|
||||
match = re.match(header_pattern, stripped)
|
||||
if match:
|
||||
headers.append({
|
||||
"title": match.group(2).strip(),
|
||||
"level": len(match.group(1)),
|
||||
"line_num": line_num,
|
||||
})
|
||||
return headers
|
||||
|
||||
def _build_nodes(self, headers: list[dict], lines: list[str], model: str | None) -> list[ContentNode]:
|
||||
nodes = []
|
||||
for i, header in enumerate(headers):
|
||||
start = header["line_num"] - 1
|
||||
end = headers[i + 1]["line_num"] - 1 if i + 1 < len(headers) else len(lines)
|
||||
text = "\n".join(lines[start:end]).strip()
|
||||
tokens = count_tokens(text, model=model)
|
||||
nodes.append(ContentNode(
|
||||
content=text,
|
||||
tokens=tokens,
|
||||
title=header["title"],
|
||||
index=header["line_num"],
|
||||
level=header["level"],
|
||||
))
|
||||
return nodes
|
||||
101
pageindex/parser/pdf.py
Normal file
101
pageindex/parser/pdf.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
import pymupdf
|
||||
from pathlib import Path
|
||||
from .protocol import ContentNode, ParsedDocument
|
||||
from ..index.utils import count_tokens
|
||||
|
||||
# Minimum width/height (in px, per get_text("dict") block metadata) for an
# extracted image to be kept; smaller blocks are skipped as icons/artifacts.
_MIN_IMAGE_SIZE = 32
|
||||
|
||||
|
||||
class PdfParser:
|
||||
def supported_extensions(self) -> list[str]:
|
||||
return [".pdf"]
|
||||
|
||||
def parse(self, file_path: str, **kwargs) -> ParsedDocument:
|
||||
path = Path(file_path)
|
||||
model = kwargs.get("model")
|
||||
images_dir = kwargs.get("images_dir")
|
||||
nodes = []
|
||||
|
||||
with pymupdf.open(str(path)) as doc:
|
||||
for i, page in enumerate(doc):
|
||||
page_num = i + 1
|
||||
if images_dir:
|
||||
content, images = self._extract_page_with_images(
|
||||
doc, page, page_num, images_dir)
|
||||
else:
|
||||
content = page.get_text()
|
||||
images = None
|
||||
|
||||
tokens = count_tokens(content, model=model)
|
||||
nodes.append(ContentNode(
|
||||
content=content or "",
|
||||
tokens=tokens,
|
||||
index=page_num,
|
||||
images=images if images else None,
|
||||
))
|
||||
|
||||
return ParsedDocument(doc_name=path.stem, nodes=nodes)
|
||||
|
||||
@staticmethod
|
||||
def _extract_page_with_images(doc, page, page_num: int,
|
||||
images_dir: str) -> tuple[str, list[dict]]:
|
||||
"""Extract text and images from a page, preserving their relative order.
|
||||
|
||||
Uses get_text("dict") to iterate blocks in reading order.
|
||||
Text blocks become text; image blocks are saved to disk and replaced
|
||||
with an inline placeholder: 
|
||||
"""
|
||||
images_path = Path(images_dir)
|
||||
images_path.mkdir(parents=True, exist_ok=True)
|
||||
# Use path relative to cwd so downstream consumers can access directly
|
||||
try:
|
||||
rel_images_path = images_path.relative_to(Path.cwd())
|
||||
except ValueError:
|
||||
rel_images_path = images_path
|
||||
|
||||
parts: list[str] = []
|
||||
images: list[dict] = []
|
||||
img_idx = 0
|
||||
|
||||
for block in page.get_text("dict")["blocks"]:
|
||||
if block["type"] == 0: # text block
|
||||
lines = []
|
||||
for line in block["lines"]:
|
||||
spans_text = "".join(span["text"] for span in line["spans"])
|
||||
lines.append(spans_text)
|
||||
parts.append("\n".join(lines))
|
||||
|
||||
elif block["type"] == 1: # image block
|
||||
width = block.get("width", 0)
|
||||
height = block.get("height", 0)
|
||||
if width < _MIN_IMAGE_SIZE or height < _MIN_IMAGE_SIZE:
|
||||
continue
|
||||
|
||||
image_bytes = block.get("image")
|
||||
ext = block.get("ext", "png")
|
||||
if not image_bytes:
|
||||
continue
|
||||
|
||||
try:
|
||||
pix = pymupdf.Pixmap(image_bytes)
|
||||
if pix.n > 4:
|
||||
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
|
||||
filename = f"p{page_num}_img{img_idx}.png"
|
||||
save_path = images_path / filename
|
||||
pix.save(str(save_path))
|
||||
pix = None
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
rel_path = str(rel_images_path / filename)
|
||||
images.append({
|
||||
"path": rel_path,
|
||||
"width": width,
|
||||
"height": height,
|
||||
})
|
||||
parts.append(f"")
|
||||
img_idx += 1
|
||||
|
||||
content = "\n".join(parts)
|
||||
return content, images
|
||||
28
pageindex/parser/protocol.py
Normal file
28
pageindex/parser/protocol.py
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from typing import Protocol, runtime_checkable
|
||||
|
||||
|
||||
@dataclass
class ContentNode:
    """Universal content unit produced by parsers."""
    # Raw text of this unit (e.g. a PDF page or a markdown header section).
    content: str
    # Token count of `content`, computed by the parser via count_tokens.
    tokens: int
    # Optional section title (e.g. markdown header text); None for page-based parsers.
    title: str | None = None
    # 1-based source position: page number or header line number, parser-dependent.
    index: int | None = None
    # Header nesting depth where applicable (markdown '#' count).
    level: int | None = None
    images: list[dict] | None = None  # [{"path": str, "width": int, "height": int}, ...]
|
||||
|
||||
|
||||
@dataclass
class ParsedDocument:
    """Unified parser output. Always a flat list of ContentNode."""
    # Document name, typically the source file's stem (no extension).
    doc_name: str
    # Flat, ordered list of parsed content units.
    nodes: list[ContentNode]
    # Optional parser-specific extras; None when a parser has nothing to add.
    metadata: dict | None = None
|
||||
|
||||
|
||||
@runtime_checkable
class DocumentParser(Protocol):
    """Structural (duck-typed) interface every document parser implements.

    runtime_checkable allows isinstance() checks against this Protocol.
    """
    # File extensions (with leading dot) the parser accepts.
    def supported_extensions(self) -> list[str]: ...
    # Parse *file_path* into a ParsedDocument; extra options go via **kwargs.
    def parse(self, file_path: str, **kwargs) -> ParsedDocument: ...
|
||||
Loading…
Add table
Add a link
Reference in a new issue