diff --git a/surfsense_backend/app/indexing_pipeline/document_chunker.py b/surfsense_backend/app/indexing_pipeline/document_chunker.py
index 4f3c698ef..6ae81b7a8 100644
--- a/surfsense_backend/app/indexing_pipeline/document_chunker.py
+++ b/surfsense_backend/app/indexing_pipeline/document_chunker.py
@@ -1,5 +1,15 @@
+import re
+
 from app.config import config
+
+# Regex that matches a Markdown table block: one or more consecutive lines that
+# (after optional indentation) start with a pipe.  The block ends at the first
+# non-table line or at end of string — a trailing newline is NOT required, so a
+# table that terminates the document keeps its final row.
+_TABLE_BLOCK_RE = re.compile(
+    r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*(?:\n|$))+)",
+    re.MULTILINE,
+)
 
 
 def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
     """Chunk a text string using the configured chunker and return the chunk texts."""
@@ -7,3 +17,43 @@ def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
         config.code_chunker_instance if use_code_chunker else config.chunker_instance
     )
     return [c.text for c in chunker.chunk(text)]
+
+
+def chunk_text_hybrid(text: str) -> list[str]:
+    """Table-aware chunker that prevents Markdown tables from being split mid-row.
+
+    Algorithm:
+    1. Scan the document for Markdown table blocks (runs of lines starting with
+       a pipe, including a table that ends the document without a newline).
+    2. Each table block is emitted as a single, unmodified chunk so that its
+       header, separator row, and data rows always stay together.
+    3. The non-table prose segments between (and around) tables are passed through
+       the normal ``chunk_text`` chunker and their sub-chunks are interleaved in
+       document order.
+
+    This ensures that table data is never sliced in the middle by the token-based
+    chunker, which would otherwise produce garbled rows that are useless for RAG.
+
+    Fixes #1334.
+    """
+    chunks: list[str] = []
+    cursor = 0
+
+    for match in _TABLE_BLOCK_RE.finditer(text):
+        # Prose before this table
+        prose = text[cursor : match.start()].strip()
+        if prose:
+            chunks.extend(chunk_text(prose))
+
+        # The table itself is kept as one indivisible chunk
+        table_block = match.group(0).strip()
+        if table_block:
+            chunks.append(table_block)
+
+        cursor = match.end()
+
+    # Remaining prose after the last table (or entire text if no tables)
+    trailing = text[cursor:].strip()
+    if trailing:
+        chunks.extend(chunk_text(trailing))
+
+    return chunks
diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
index e6b2458f3..2339647ea 100644
--- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
+++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
@@ -19,7 +19,7 @@ from app.db import (
     DocumentType,
 )
 from app.indexing_pipeline.connector_document import ConnectorDocument
-from app.indexing_pipeline.document_chunker import chunk_text
+from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
 from app.indexing_pipeline.document_embedder import embed_texts
 from app.indexing_pipeline.document_hashing import (
     compute_content_hash,
@@ -387,11 +387,19 @@ class IndexingPipelineService:
         )
 
         t_step = time.perf_counter()
-        chunk_texts = await asyncio.to_thread(
-            chunk_text,
-            connector_doc.source_markdown,
-            use_code_chunker=connector_doc.should_use_code_chunker,
-        )
+        if connector_doc.should_use_code_chunker:
+            chunk_texts = await asyncio.to_thread(
+                chunk_text,
+                connector_doc.source_markdown,
+                use_code_chunker=True,
+            )
+        else:
+            # Use the table-aware hybrid chunker so Markdown tables are not
+            # split mid-row (see issue #1334).
+            chunk_texts = await asyncio.to_thread(
+                chunk_text_hybrid,
+                connector_doc.source_markdown,
+            )
 
         texts_to_embed = [content, *chunk_texts]
         embeddings = await asyncio.to_thread(embed_texts, texts_to_embed)