mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-20 21:18:13 +02:00
refactor(indexing): expose chunk_markdown and embed_batch helpers
Split _compute so the incremental edit path can reuse the exact same chunker selection and embedding entry points (and their test patch targets) without going through the doc-level cache.
This commit is contained in:
parent
f82dedf712
commit
8d413ea5c2
1 changed file with 18 additions and 10 deletions
|
|
@@ -58,7 +58,9 @@ async def build_chunk_embeddings(
|
||||||
cached = await _recall(key)
|
cached = await _recall(key)
|
||||||
if cached is not None:
|
if cached is not None:
|
||||||
metrics.record_embedding_cache_lookup(
|
metrics.record_embedding_cache_lookup(
|
||||||
embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="hit"
|
embedding_model=key.embedding_model,
|
||||||
|
chunker_kind=chunker_kind,
|
||||||
|
outcome="hit",
|
||||||
)
|
)
|
||||||
logger.debug("Embedding cache hit for %s", key.markdown_sha256)
|
logger.debug("Embedding cache hit for %s", key.markdown_sha256)
|
||||||
return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks]
|
return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks]
|
||||||
|
|
@@ -73,18 +75,24 @@ async def build_chunk_embeddings(
|
||||||
return summary_embedding, chunk_pairs
|
return summary_embedding, chunk_pairs
|
||||||
|
|
||||||
|
|
||||||
|
async def chunk_markdown(markdown: str, *, use_code_chunker: bool) -> list[str]:
|
||||||
|
"""Chunk markdown into ordered texts with the pipeline's chunker selection."""
|
||||||
|
if use_code_chunker:
|
||||||
|
return await asyncio.to_thread(chunk_text, markdown, use_code_chunker=True)
|
||||||
|
# Table-aware hybrid chunker keeps Markdown tables intact (issue #1334).
|
||||||
|
return await asyncio.to_thread(chunk_text_hybrid, markdown)
|
||||||
|
|
||||||
|
|
||||||
|
async def embed_batch(texts: list[str]) -> list[np.ndarray]:
|
||||||
|
"""Embed texts in one batch off the event loop."""
|
||||||
|
return await asyncio.to_thread(embed_texts, texts)
|
||||||
|
|
||||||
|
|
||||||
async def _compute(
|
async def _compute(
|
||||||
markdown: str, *, use_code_chunker: bool
|
markdown: str, *, use_code_chunker: bool
|
||||||
) -> tuple[np.ndarray, list[ChunkPair]]:
|
) -> tuple[np.ndarray, list[ChunkPair]]:
|
||||||
if use_code_chunker:
|
chunk_texts = await chunk_markdown(markdown, use_code_chunker=use_code_chunker)
|
||||||
chunk_texts = await asyncio.to_thread(
|
embeddings = await embed_batch([markdown, *chunk_texts])
|
||||||
chunk_text, markdown, use_code_chunker=True
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Table-aware hybrid chunker keeps Markdown tables intact (issue #1334).
|
|
||||||
chunk_texts = await asyncio.to_thread(chunk_text_hybrid, markdown)
|
|
||||||
|
|
||||||
embeddings = await asyncio.to_thread(embed_texts, [markdown, *chunk_texts])
|
|
||||||
summary_embedding, *chunk_embeddings = embeddings
|
summary_embedding, *chunk_embeddings = embeddings
|
||||||
return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=False))
|
return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=False))
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue