# Mirror of https://github.com/VectifyAI/PageIndex.git
# Synced 2026-04-28 09:36:21 +02:00  (59 lines, 2.1 KiB, Python)
import re
from pathlib import Path

from .protocol import ContentNode, ParsedDocument
from ..index.utils import count_tokens
class MarkdownParser:
|
|
def supported_extensions(self) -> list[str]:
|
|
return [".md", ".markdown"]
|
|
|
|
def parse(self, file_path: str, **kwargs) -> ParsedDocument:
|
|
path = Path(file_path)
|
|
model = kwargs.get("model")
|
|
|
|
with open(path, "r", encoding="utf-8") as f:
|
|
content = f.read()
|
|
|
|
lines = content.split("\n")
|
|
headers = self._extract_headers(lines)
|
|
nodes = self._build_nodes(headers, lines, model)
|
|
|
|
return ParsedDocument(doc_name=path.stem, nodes=nodes)
|
|
|
|
def _extract_headers(self, lines: list[str]) -> list[dict]:
|
|
header_pattern = r"^(#{1,6})\s+(.+)$"
|
|
code_block_pattern = r"^```"
|
|
headers = []
|
|
in_code_block = False
|
|
|
|
for line_num, line in enumerate(lines, 1):
|
|
stripped = line.strip()
|
|
if re.match(code_block_pattern, stripped):
|
|
in_code_block = not in_code_block
|
|
continue
|
|
if not in_code_block and stripped:
|
|
match = re.match(header_pattern, stripped)
|
|
if match:
|
|
headers.append({
|
|
"title": match.group(2).strip(),
|
|
"level": len(match.group(1)),
|
|
"line_num": line_num,
|
|
})
|
|
return headers
|
|
|
|
def _build_nodes(self, headers: list[dict], lines: list[str], model: str | None) -> list[ContentNode]:
|
|
nodes = []
|
|
for i, header in enumerate(headers):
|
|
start = header["line_num"] - 1
|
|
end = headers[i + 1]["line_num"] - 1 if i + 1 < len(headers) else len(lines)
|
|
text = "\n".join(lines[start:end]).strip()
|
|
tokens = count_tokens(text, model=model)
|
|
nodes.append(ContentNode(
|
|
content=text,
|
|
tokens=tokens,
|
|
title=header["title"],
|
|
index=header["line_num"],
|
|
level=header["level"],
|
|
))
|
|
return nodes
|