From 8d413ea5c2b644e86de2f626b8ba461cfe4baa42 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 18:52:57 +0200 Subject: [PATCH] refactor(indexing): expose chunk_markdown and embed_batch helpers Split _compute so the incremental edit path can reuse the exact same chunker selection and embedding entry points (and their test patch targets) without going through the doc-level cache. --- .../cache/cached_indexing.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py index c93f2f133..95321a229 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py +++ b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py @@ -58,7 +58,9 @@ async def build_chunk_embeddings( cached = await _recall(key) if cached is not None: metrics.record_embedding_cache_lookup( - embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="hit" + embedding_model=key.embedding_model, + chunker_kind=chunker_kind, + outcome="hit", ) logger.debug("Embedding cache hit for %s", key.markdown_sha256) return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks] @@ -73,18 +75,24 @@ async def build_chunk_embeddings( return summary_embedding, chunk_pairs +async def chunk_markdown(markdown: str, *, use_code_chunker: bool) -> list[str]: + """Chunk markdown into ordered texts with the pipeline's chunker selection.""" + if use_code_chunker: + return await asyncio.to_thread(chunk_text, markdown, use_code_chunker=True) + # Table-aware hybrid chunker keeps Markdown tables intact (issue #1334). + return await asyncio.to_thread(chunk_text_hybrid, markdown) + + +async def embed_batch(texts: list[str]) -> list[np.ndarray]: + """Embed texts in one batch off the event loop.""" + return await asyncio.to_thread(embed_texts, texts) + + async def _compute( markdown: str, *, use_code_chunker: bool ) -> tuple[np.ndarray, list[ChunkPair]]: - if use_code_chunker: - chunk_texts = await asyncio.to_thread( - chunk_text, markdown, use_code_chunker=True - ) - else: - # Table-aware hybrid chunker keeps Markdown tables intact (issue #1334). - chunk_texts = await asyncio.to_thread(chunk_text_hybrid, markdown) - - embeddings = await asyncio.to_thread(embed_texts, [markdown, *chunk_texts]) + chunk_texts = await chunk_markdown(markdown, use_code_chunker=use_code_chunker) + embeddings = await embed_batch([markdown, *chunk_texts]) summary_embedding, *chunk_embeddings = embeddings return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=False))