From 03012c307729ee98e0527ba56c1f08d67590bc49 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 18 Jun 2026 20:06:33 +0200 Subject: [PATCH] test: span-aware paragraph chunker fixture --- .../indexing_pipeline/test_index_editions.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py index 68d5ec0af..f86ee8e4f 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py @@ -18,16 +18,22 @@ _V1 = "Intro paragraph.\n\nBody paragraph.\n\nOutro paragraph." @pytest.fixture def paragraph_chunker(monkeypatch): - """One chunk per markdown paragraph, so edits map to chunk-level diffs.""" + """One slice per markdown paragraph, so edits map to chunk-level diffs.""" + from app.indexing_pipeline.document_chunker import ChunkSlice - def _split(markdown, **_kwargs): - return [p for p in markdown.split("\n\n") if p.strip()] + def _split(markdown, *_args, **_kwargs): + slices = [] + cursor = 0 + for para in markdown.split("\n\n"): + start = markdown.index(para, cursor) + cursor = start + len(para) + if para.strip(): + slices.append(ChunkSlice(para, start, cursor)) + return slices monkeypatch.setattr( - "app.indexing_pipeline.cache.cached_indexing.chunk_text", _split - ) - monkeypatch.setattr( - "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", _split + "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", + _split, )