feat(chunker): add table-aware chunk_text_hybrid to prevent mid-row table splits

document_chunker currently splits Markdown tables mid-row when a table is larger than a single chunk window, producing garbled rows that are useless for RAG retrieval (issue #1334).

Changes:
- document_chunker.py: add chunk_text_hybrid(), which detects Markdown table blocks with a regex, emits each table as a single indivisible chunk, and feeds the surrounding prose through the normal chunk_text() chunker.
- indexing_pipeline_service.py: route normal (non-code) documents through chunk_text_hybrid instead of chunk_text so tables are protected by default.

Fixes #1334
2026-05-09 07:42:39 +02:00 · 2026-05-05 12:48:04 +08:00 · 2026-05-05 12:48:04 +08:00 · 2f3a33c9d5
commit 2f3a33c9d5
parent e5c00221c8
2 changed files with 64 additions and 6 deletions
--- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
+++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
@@ -19,7 +19,7 @@ from app.db import (
    DocumentType,
 )
 from app.indexing_pipeline.connector_document import ConnectorDocument
-from app.indexing_pipeline.document_chunker import chunk_text
+from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
 from app.indexing_pipeline.document_embedder import embed_texts
 from app.indexing_pipeline.document_hashing import (
    compute_content_hash,
@@ -387,11 +387,19 @@ class IndexingPipelineService:
            )

            t_step = time.perf_counter()
-            chunk_texts = await asyncio.to_thread(
-                chunk_text,
-                connector_doc.source_markdown,
-                use_code_chunker=connector_doc.should_use_code_chunker,
-            )
+            if connector_doc.should_use_code_chunker:
+                chunk_texts = await asyncio.to_thread(
+                    chunk_text,
+                    connector_doc.source_markdown,
+                    use_code_chunker=True,
+                )
+            else:
+                # Use the table-aware hybrid chunker so Markdown tables are not
+                # split mid-row (see issue #1334).
+                chunk_texts = await asyncio.to_thread(
+                    chunk_text_hybrid,
+                    connector_doc.source_markdown,
+                )

            texts_to_embed = [content, *chunk_texts]
            embeddings = await asyncio.to_thread(embed_texts, texts_to_embed)