feat(chunker): add table-aware chunk_text_hybrid to prevent mid-row table splits

Document_chunker currently splits Markdown tables mid-row when the table is
larger than a single chunk window, producing garbled rows that are useless
for RAG retrieval (issue #1334).

Changes:
- document_chunker.py: add chunk_text_hybrid() that detects Markdown table
  blocks with a regex, emits each table as an indivisible single chunk, and
  feeds the surrounding prose through the normal chunk_text() chunker.
- indexing_pipeline_service.py: route normal (non-code) documents through
  chunk_text_hybrid instead of chunk_text so tables are protected by default.

Fixes #1334
This commit is contained in:
guangyang1206 2026-05-05 12:48:04 +08:00
parent e5c00221c8
commit 2f3a33c9d5
2 changed files with 64 additions and 6 deletions

View file

@ -19,7 +19,7 @@ from app.db import (
DocumentType,
)
from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_chunker import chunk_text
from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
from app.indexing_pipeline.document_embedder import embed_texts
from app.indexing_pipeline.document_hashing import (
compute_content_hash,
@ -387,11 +387,19 @@ class IndexingPipelineService:
)
t_step = time.perf_counter()
chunk_texts = await asyncio.to_thread(
chunk_text,
connector_doc.source_markdown,
use_code_chunker=connector_doc.should_use_code_chunker,
)
if connector_doc.should_use_code_chunker:
chunk_texts = await asyncio.to_thread(
chunk_text,
connector_doc.source_markdown,
use_code_chunker=True,
)
else:
# Use the table-aware hybrid chunker so Markdown tables are not
# split mid-row (see issue #1334).
chunk_texts = await asyncio.to_thread(
chunk_text_hybrid,
connector_doc.source_markdown,
)
texts_to_embed = [content, *chunk_texts]
embeddings = await asyncio.to_thread(embed_texts, texts_to_embed)