mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-06 06:12:40 +02:00
Merge pull request #1348 from guangyang1206/feat/document-chunker-table-aware-hybrid-1334
feat(chunker): add table-aware chunk_text_hybrid to prevent mid-row table splits
This commit is contained in:
commit
489dd0aa52
2 changed files with 64 additions and 6 deletions
|
|
@ -1,5 +1,15 @@
|
|||
import re
|
||||
|
||||
from app.config import config
|
||||
|
||||
# Regex that matches a Markdown table block (header + separator + one or more rows)
|
||||
# A table block starts with a | at the beginning of a line and ends when a
|
||||
# non-table line (or end of string) is encountered.
|
||||
_TABLE_BLOCK_RE = re.compile(
|
||||
r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*\n)+)",
|
||||
re.MULTILINE,
|
||||
)
|
||||
|
||||
|
||||
def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
|
||||
"""Chunk a text string using the configured chunker and return the chunk texts."""
|
||||
|
|
@ -7,3 +17,43 @@ def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
|
|||
config.code_chunker_instance if use_code_chunker else config.chunker_instance
|
||||
)
|
||||
return [c.text for c in chunker.chunk(text)]
|
||||
|
||||
|
||||
def chunk_text_hybrid(text: str) -> list[str]:
    """Chunk *text* while keeping every Markdown table block in one piece.

    The text is scanned with ``_TABLE_BLOCK_RE``.  Each matched table block is
    emitted, whitespace-stripped, as a single chunk so its rows are never
    separated (see issue #1334).  The prose before, between, and after the
    tables is whitespace-stripped and handed to ``chunk_text``; the resulting
    sub-chunks are placed in the output in document order alongside the
    table chunks.  Segments that are empty after stripping are dropped.

    Args:
        text: The Markdown document to chunk.

    Returns:
        The chunk strings in document order.
    """
    pieces: list[str] = []

    def _emit_prose(segment: str) -> None:
        # Prose goes through the regular chunker; blank segments are ignored.
        stripped = segment.strip()
        if stripped:
            pieces.extend(chunk_text(stripped))

    position = 0  # index just past the last table consumed so far
    for table_match in _TABLE_BLOCK_RE.finditer(text):
        table_start, table_end = table_match.span()

        # Everything between the previous table and this one is prose.
        _emit_prose(text[position:table_start])

        # The table is appended whole, never passed to the chunker.
        table = table_match.group(0).strip()
        if table:
            pieces.append(table)

        position = table_end

    # Whatever follows the last table (or the whole text when no table matched).
    _emit_prose(text[position:])

    return pieces
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ from app.db import (
|
|||
DocumentType,
|
||||
)
|
||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||
from app.indexing_pipeline.document_chunker import chunk_text
|
||||
from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
|
||||
from app.indexing_pipeline.document_embedder import embed_texts
|
||||
from app.indexing_pipeline.document_hashing import (
|
||||
compute_content_hash,
|
||||
|
|
@ -387,11 +387,19 @@ class IndexingPipelineService:
|
|||
)
|
||||
|
||||
t_step = time.perf_counter()
|
||||
chunk_texts = await asyncio.to_thread(
|
||||
chunk_text,
|
||||
connector_doc.source_markdown,
|
||||
use_code_chunker=connector_doc.should_use_code_chunker,
|
||||
)
|
||||
if connector_doc.should_use_code_chunker:
|
||||
chunk_texts = await asyncio.to_thread(
|
||||
chunk_text,
|
||||
connector_doc.source_markdown,
|
||||
use_code_chunker=True,
|
||||
)
|
||||
else:
|
||||
# Use the table-aware hybrid chunker so Markdown tables are not
|
||||
# split mid-row (see issue #1334).
|
||||
chunk_texts = await asyncio.to_thread(
|
||||
chunk_text_hybrid,
|
||||
connector_doc.source_markdown,
|
||||
)
|
||||
|
||||
texts_to_embed = [content, *chunk_texts]
|
||||
embeddings = await asyncio.to_thread(embed_texts, texts_to_embed)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue