mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-09 07:42:39 +02:00
feat(chunker): add table-aware chunk_text_hybrid to prevent mid-row table splits
document_chunker currently splits Markdown tables mid-row when a table is larger than a single chunk window, producing garbled rows that are useless for RAG retrieval (issue #1334).

Changes:
- document_chunker.py: add chunk_text_hybrid(), which detects Markdown table blocks with a regex, emits each table as a single indivisible chunk, and feeds the surrounding prose through the normal chunk_text() chunker.
- indexing_pipeline_service.py: route normal (non-code) documents through chunk_text_hybrid instead of chunk_text so tables are protected by default.

Fixes #1334
This commit is contained in:
parent
e5c00221c8
commit
2f3a33c9d5
2 changed files with 64 additions and 6 deletions
|
|
@ -19,7 +19,7 @@ from app.db import (
|
|||
DocumentType,
|
||||
)
|
||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||
from app.indexing_pipeline.document_chunker import chunk_text
|
||||
from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
|
||||
from app.indexing_pipeline.document_embedder import embed_texts
|
||||
from app.indexing_pipeline.document_hashing import (
|
||||
compute_content_hash,
|
||||
|
|
@ -387,11 +387,19 @@ class IndexingPipelineService:
|
|||
)
|
||||
|
||||
t_step = time.perf_counter()
|
||||
chunk_texts = await asyncio.to_thread(
|
||||
chunk_text,
|
||||
connector_doc.source_markdown,
|
||||
use_code_chunker=connector_doc.should_use_code_chunker,
|
||||
)
|
||||
if connector_doc.should_use_code_chunker:
|
||||
chunk_texts = await asyncio.to_thread(
|
||||
chunk_text,
|
||||
connector_doc.source_markdown,
|
||||
use_code_chunker=True,
|
||||
)
|
||||
else:
|
||||
# Use the table-aware hybrid chunker so Markdown tables are not
|
||||
# split mid-row (see issue #1334).
|
||||
chunk_texts = await asyncio.to_thread(
|
||||
chunk_text_hybrid,
|
||||
connector_doc.source_markdown,
|
||||
)
|
||||
|
||||
texts_to_embed = [content, *chunk_texts]
|
||||
embeddings = await asyncio.to_thread(embed_texts, texts_to_embed)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue