feat: add lossless span-aware chunk_markdown_with_spans

2026-06-20 21:18:13 +02:00 · 2026-06-18 20:06:26 +02:00 · 2026-06-18 20:06:26 +02:00 · 0ab773cbcd
commit 0ab773cbcd
parent 1048490ba8
1 changed files with 68 additions and 32 deletions
--- a/surfsense_backend/app/indexing_pipeline/document_chunker.py
+++ b/surfsense_backend/app/indexing_pipeline/document_chunker.py
@@ -1,16 +1,30 @@
 import re
+from dataclasses import dataclass

 from app.config import config

 # Regex that matches a Markdown table block (header + separator + one or more rows)
 # A table block starts with a | at the beginning of a line and ends when a
-# non-table line (or end of string) is encountered.
+# non-table line (or end of string) is encountered. The final row may end at EOF
+# without a trailing newline, so the whole table stays one slice.
 _TABLE_BLOCK_RE = re.compile(
-    r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*\n)+)",
+    r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*(?:\n|$))+)",
    re.MULTILINE,
 )


+@dataclass(frozen=True, slots=True)
+class ChunkSlice:
+    """A chunk paired with its half-open char span into the source markdown.
+
+    Invariant: ``markdown[start_char:end_char] == text``.
+    """
+
+    text: str
+    start_char: int
+    end_char: int
+
+
 def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
    """Chunk a text string using the configured chunker and return the chunk texts."""
    chunker = (
@@ -19,41 +33,63 @@ def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
    return [c.text for c in chunker.chunk(text)]


-def chunk_text_hybrid(text: str) -> list[str]:
-    """Table-aware chunker that prevents Markdown tables from being split mid-row.
+def chunk_markdown_with_spans(
+    text: str, use_code_chunker: bool = False
+) -> list[ChunkSlice]:
+    """Chunk markdown into a lossless, contiguous partition of char-addressed slices.

-    Algorithm:
-    1. Scan the document for Markdown table blocks.
-    2. Each table block is emitted as a single, unmodified chunk so that its
-       header, separator row, and data rows always stay together.
-    3. The non-table prose segments between (and around) tables are passed through
-       the normal ``chunk_text`` chunker and their sub-chunks are interleaved in
-       document order.
-
-    This ensures that table data is never sliced in the middle by the token-based
-    chunker, which would otherwise produce garbled rows that are useless for RAG.
-
-    Fixes #1334.
+    Tables stay whole (issue #1334). Each slice ``s`` is an exact substring of
+    ``text`` (``text[s.start_char:s.end_char] == s.text``) and the slices
+    concatenate back to ``text``. Citations resolve against these offsets.
    """
-    chunks: list[str] = []
+    if not text:
+        return []
+
+    slices: list[ChunkSlice] = []
    cursor = 0

    for match in _TABLE_BLOCK_RE.finditer(text):
-        # Prose before this table
-        prose = text[cursor : match.start()].strip()
-        if prose:
-            chunks.extend(chunk_text(prose))
-
-        # The table itself is kept as one indivisible chunk
-        table_block = match.group(0).strip()
-        if table_block:
-            chunks.append(table_block)
-
+        if match.start() > cursor:
+            slices.extend(
+                _segment_slices(text, cursor, match.start(), use_code_chunker)
+            )
+        slices.append(ChunkSlice(match.group(0), match.start(), match.end()))
        cursor = match.end()

-    # Remaining prose after the last table (or entire text if no tables)
-    trailing = text[cursor:].strip()
-    if trailing:
-        chunks.extend(chunk_text(trailing))
+    if len(text) > cursor:
+        slices.extend(_segment_slices(text, cursor, len(text), use_code_chunker))

-    return chunks
+    return slices
+
+
+def _segment_slices(
+    text: str, start: int, end: int, use_code_chunker: bool
+) -> list[ChunkSlice]:
+    """Sub-chunk one non-table segment into contiguous, char-addressed slices."""
+    chunker = (
+        config.code_chunker_instance if use_code_chunker else config.chunker_instance
+    )
+    segment = text[start:end]
+    chunks = chunker.chunk(segment)
+
+    slices: list[ChunkSlice] = []
+    local = 0
+    for chunk in chunks:
+        # Use the chunker's end offset only as a cut point, then re-slice the
+        # segment ourselves so the result is an exact, gap-free substring.
+        local_end = min(max(chunk.end_index, local), len(segment))
+        if local_end <= local:
+            continue
+        slices.append(
+            ChunkSlice(segment[local:local_end], start + local, start + local_end)
+        )
+        local = local_end
+
+    if local < len(segment):
+        if slices:
+            last = slices[-1]
+            slices[-1] = ChunkSlice(text[last.start_char : end], last.start_char, end)
+        else:
+            slices.append(ChunkSlice(segment[local:], start + local, end))
+
+    return slices