mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-22 21:28:12 +02:00
feat: add lossless span-aware chunk_markdown_with_spans
This commit is contained in:
parent
1048490ba8
commit
0ab773cbcd
1 changed files with 68 additions and 32 deletions
|
|
@ -1,16 +1,30 @@
|
||||||
import re
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
from app.config import config
|
from app.config import config
|
||||||
|
|
||||||
# Regex that matches a Markdown table block (header + separator + one or more rows)
|
# Regex that matches a Markdown table block (header + separator + one or more rows)
|
||||||
# A table block starts with a | at the beginning of a line and ends when a
|
# A table block starts with a | at the beginning of a line and ends when a
|
||||||
# non-table line (or end of string) is encountered.
|
# non-table line (or end of string) is encountered. The final row may end at EOF
|
||||||
|
# without a trailing newline, so the whole table stays one slice.
|
||||||
_TABLE_BLOCK_RE = re.compile(
|
_TABLE_BLOCK_RE = re.compile(
|
||||||
r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*\n)+)",
|
r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*(?:\n|$))+)",
|
||||||
re.MULTILINE,
|
re.MULTILINE,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True, slots=True)
|
||||||
|
class ChunkSlice:
|
||||||
|
"""A chunk paired with its half-open char span into the source markdown.
|
||||||
|
|
||||||
|
Invariant: ``markdown[start_char:end_char] == text``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
text: str
|
||||||
|
start_char: int
|
||||||
|
end_char: int
|
||||||
|
|
||||||
|
|
||||||
def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
|
def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
|
||||||
"""Chunk a text string using the configured chunker and return the chunk texts."""
|
"""Chunk a text string using the configured chunker and return the chunk texts."""
|
||||||
chunker = (
|
chunker = (
|
||||||
|
|
@ -19,41 +33,63 @@ def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
|
||||||
return [c.text for c in chunker.chunk(text)]
|
return [c.text for c in chunker.chunk(text)]
|
||||||
|
|
||||||
|
|
||||||
def chunk_markdown_with_spans(
    text: str, use_code_chunker: bool = False
) -> list[ChunkSlice]:
    """Split markdown into a lossless, contiguous list of char-addressed slices.

    Every Markdown table block is emitted as a single slice so it is never
    split mid-row (issue #1334). The text between and around tables is
    sub-chunked by ``_segment_slices``. Each slice is an exact substring of
    ``text``: ``"".join(s.text for s in slices) == text`` and
    ``text[s.start_char:s.end_char] == s.text`` for every slice ``s``. These
    spans are the offsets that citations are resolved against.

    Args:
        text: The markdown document to partition.
        use_code_chunker: Forwarded to ``_segment_slices`` to select the
            chunker used for the non-table segments.

    Returns:
        The slices in document order; an empty list when ``text`` is empty.
    """
    result: list[ChunkSlice] = []
    if not text:
        return result

    position = 0  # first character not yet covered by a slice
    for table in _TABLE_BLOCK_RE.finditer(text):
        table_start, table_end = table.span()
        # Anything between the previous table (or the start) and this one.
        if position < table_start:
            result += _segment_slices(
                text, position, table_start, use_code_chunker
            )
        # The table itself stays whole and unmodified.
        result.append(ChunkSlice(table.group(0), table_start, table_end))
        position = table_end

    # Whatever follows the last table, or the entire text if there was none.
    total = len(text)
    if position < total:
        result += _segment_slices(text, position, total, use_code_chunker)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _segment_slices(
    text: str, start: int, end: int, use_code_chunker: bool
) -> list[ChunkSlice]:
    """Sub-chunk one non-table segment into contiguous, char-addressed slices.

    Args:
        text: The full source markdown.
        start: Offset in ``text`` where the segment begins (inclusive).
        end: Offset in ``text`` where the segment ends (exclusive).
        use_code_chunker: Use ``config.code_chunker_instance`` instead of
            ``config.chunker_instance``.

    Returns:
        Slices that together cover ``text[start:end]`` with no gaps or
        overlaps; empty when the segment itself is empty.
    """
    if use_code_chunker:
        chunker = config.code_chunker_instance
    else:
        chunker = config.chunker_instance
    segment = text[start:end]
    segment_len = len(segment)

    pieces: list[ChunkSlice] = []
    consumed = 0  # segment-relative offset already covered by ``pieces``
    for piece in chunker.chunk(segment):
        # Only the chunker's end offset is trusted, and only as a cut point:
        # clamp it into [consumed, segment_len] and re-slice the segment here
        # so every piece is an exact, gap-free substring.
        cut = min(max(piece.end_index, consumed), segment_len)
        if cut > consumed:
            pieces.append(
                ChunkSlice(segment[consumed:cut], start + consumed, start + cut)
            )
            consumed = cut

    if consumed >= segment_len:
        return pieces

    # The chunker left a tail uncovered.
    if not pieces:
        # Nothing was emitted at all: the remainder becomes the only slice.
        return [ChunkSlice(segment[consumed:], start + consumed, end)]
    # Otherwise stretch the last slice so it runs to the end of the segment.
    tail_start = pieces[-1].start_char
    pieces[-1] = ChunkSlice(text[tail_start:end], tail_start, end)
    return pieces
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue