# Mirror of https://github.com/VectifyAI/PageIndex.git
# Synced 2026-04-28 09:36:21 +02:00  (59 lines, 2.1 KiB, Python)
import re
from pathlib import Path

from .protocol import ContentNode, ParsedDocument
from ..index.utils import count_tokens
class MarkdownParser:
|
|
def supported_extensions(self) -> list[str]:
|
|
return [".md", ".markdown"]
|
|
|
|
def parse(self, file_path: str, **kwargs) -> ParsedDocument:
|
|
path = Path(file_path)
|
|
model = kwargs.get("model")
|
|
|
|
with open(path, "r", encoding="utf-8") as f:
|
|
content = f.read()
|
|
|
|
lines = content.split("\n")
|
|
headers = self._extract_headers(lines)
|
|
nodes = self._build_nodes(headers, lines, model)
|
|
|
|
return ParsedDocument(doc_name=path.stem, nodes=nodes)
|
|
|
|
def _extract_headers(self, lines: list[str]) -> list[dict]:
|
|
header_pattern = r"^(#{1,6})\s+(.+)$"
|
|
code_block_pattern = r"^```"
|
|
headers = []
|
|
in_code_block = False
|
|
|
|
for line_num, line in enumerate(lines, 1):
|
|
stripped = line.strip()
|
|
if re.match(code_block_pattern, stripped):
|
|
in_code_block = not in_code_block
|
|
continue
|
|
if not in_code_block and stripped:
|
|
match = re.match(header_pattern, stripped)
|
|
if match:
|
|
headers.append({
|
|
"title": match.group(2).strip(),
|
|
"level": len(match.group(1)),
|
|
"line_num": line_num,
|
|
})
|
|
return headers
|
|
|
|
def _build_nodes(self, headers: list[dict], lines: list[str], model: str | None) -> list[ContentNode]:
|
|
nodes = []
|
|
for i, header in enumerate(headers):
|
|
start = header["line_num"] - 1
|
|
end = headers[i + 1]["line_num"] - 1 if i + 1 < len(headers) else len(lines)
|
|
text = "\n".join(lines[start:end]).strip()
|
|
tokens = count_tokens(text, model=model)
|
|
nodes.append(ContentNode(
|
|
content=text,
|
|
tokens=tokens,
|
|
title=header["title"],
|
|
index=header["line_num"],
|
|
level=header["level"],
|
|
))
|
|
return nodes
|