mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-06 14:22:47 +02:00
Document_chunker currently splits Markdown tables mid-row when the table is larger than a single chunk window, producing garbled rows that are useless for RAG retrieval (issue #1334). Changes: - document_chunker.py: add chunk_text_hybrid() that detects Markdown table blocks with a regex, emits each table as an indivisible single chunk, and feeds the surrounding prose through the normal chunk_text() chunker. - indexing_pipeline_service.py: route normal (non-code) documents through chunk_text_hybrid instead of chunk_text so tables are protected by default. Fixes #1334
59 lines
2 KiB
Python
59 lines
2 KiB
Python
import re
|
|
|
|
from app.config import config
|
|
|
|
# Regex that matches a Markdown table block (header + separator + one or more rows)
|
|
# A table block starts with a | at the beginning of a line and ends when a
|
|
# non-table line (or end of string) is encountered.
|
|
_TABLE_BLOCK_RE = re.compile(
|
|
r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*\n)+)",
|
|
re.MULTILINE,
|
|
)
|
|
|
|
|
|
def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
|
|
"""Chunk a text string using the configured chunker and return the chunk texts."""
|
|
chunker = (
|
|
config.code_chunker_instance if use_code_chunker else config.chunker_instance
|
|
)
|
|
return [c.text for c in chunker.chunk(text)]
|
|
|
|
|
|
def chunk_text_hybrid(text: str) -> list[str]:
|
|
"""Table-aware chunker that prevents Markdown tables from being split mid-row.
|
|
|
|
Algorithm:
|
|
1. Scan the document for Markdown table blocks.
|
|
2. Each table block is emitted as a single, unmodified chunk so that its
|
|
header, separator row, and data rows always stay together.
|
|
3. The non-table prose segments between (and around) tables are passed through
|
|
the normal ``chunk_text`` chunker and their sub-chunks are interleaved in
|
|
document order.
|
|
|
|
This ensures that table data is never sliced in the middle by the token-based
|
|
chunker, which would otherwise produce garbled rows that are useless for RAG.
|
|
|
|
Fixes #1334.
|
|
"""
|
|
chunks: list[str] = []
|
|
cursor = 0
|
|
|
|
for match in _TABLE_BLOCK_RE.finditer(text):
|
|
# Prose before this table
|
|
prose = text[cursor : match.start()].strip()
|
|
if prose:
|
|
chunks.extend(chunk_text(prose))
|
|
|
|
# The table itself is kept as one indivisible chunk
|
|
table_block = match.group(0).strip()
|
|
if table_block:
|
|
chunks.append(table_block)
|
|
|
|
cursor = match.end()
|
|
|
|
# Remaining prose after the last table (or entire text if no tables)
|
|
trailing = text[cursor:].strip()
|
|
if trailing:
|
|
chunks.extend(chunk_text(trailing))
|
|
|
|
return chunks
|