From f82dedf712862b96e5748cbc3ae539ae5a071e0c Mon Sep 17 00:00:00 2001
From: CREDO23
Date: Fri, 12 Jun 2026 18:52:46 +0200
Subject: [PATCH] feat(indexing): add pure chunk reconciler for content-addressed diffs

Greedy multiset match on chunk text decides which rows keep their embeddings,
which texts need embedding, and which rows are deleted. No DB, no embeddings;
fully unit-tested (reuse, head insert, middle edit, deletion, duplicates,
reorder, full rewrite).
---
 .../app/indexing_pipeline/chunk_reconciler.py | 56 +++++++++++
 .../test_chunk_reconciler.py                  | 94 +++++++++++++++++++
 2 files changed, 150 insertions(+)
 create mode 100644 surfsense_backend/app/indexing_pipeline/chunk_reconciler.py
 create mode 100644 surfsense_backend/tests/unit/indexing_pipeline/test_chunk_reconciler.py

diff --git a/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py b/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py
new file mode 100644
index 000000000..9354aeb9f
--- /dev/null
+++ b/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py
@@ -0,0 +1,56 @@
+"""Diff a document's existing chunk rows against its freshly chunked texts.
+
+Embeddings are a pure function of chunk text, so a row whose content reappears
+in the new chunking keeps its embedding (and its HNSW/GIN index entries); only
+genuinely new texts are embedded and only vanished rows are deleted. Matching
+is a greedy multiset match on content in document order, so duplicate
+boilerplate chunks pair up one-to-one and reordered chunks become cheap
+position updates instead of delete+reinsert.
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict, deque
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, slots=True)
+class ExistingChunk:
+    id: int
+    content: str
+    position: int
+
+
+@dataclass(frozen=True, slots=True)
+class ChunkPlan:
+    """The minimal set of writes that turns the stored chunks into the new ones.
+
+    ``reused`` holds only kept rows whose position actually changed; rows that
+    match in place need no write at all. Kept-row count (for metrics) is
+    ``len(existing) - len(to_delete)``.
+    """
+
+    reused: list[tuple[int, int]]  # (existing_chunk_id, new_position)
+    to_embed: list[tuple[int, str]]  # (new_position, text)
+    to_delete: list[int]  # existing chunk ids
+
+
+def reconcile(existing: list[ExistingChunk], new_texts: list[str]) -> ChunkPlan:
+    available: dict[str, deque[ExistingChunk]] = defaultdict(deque)
+    for chunk in sorted(existing, key=lambda c: c.position):
+        available[chunk.content].append(chunk)
+
+    reused: list[tuple[int, int]] = []
+    to_embed: list[tuple[int, str]] = []
+
+    for new_position, text in enumerate(new_texts):
+        matches = available.get(text)
+        if matches:
+            chunk = matches.popleft()
+            if chunk.position != new_position:
+                reused.append((chunk.id, new_position))
+        else:
+            to_embed.append((new_position, text))
+
+    to_delete = [chunk.id for queue in available.values() for chunk in queue]
+    return ChunkPlan(reused=reused, to_embed=to_embed, to_delete=to_delete)
diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_reconciler.py b/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_reconciler.py
new file mode 100644
index 000000000..7effce840
--- /dev/null
+++ b/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_reconciler.py
@@ -0,0 +1,94 @@
+"""reconcile(): diff existing chunk rows against new chunk texts.
+
+The reconciler decides which rows (and embeddings) survive an edit, which texts
+must be embedded, and which rows go away -- purely from content, no DB.
+"""
+
+from __future__ import annotations
+
+from app.indexing_pipeline.chunk_reconciler import ExistingChunk, reconcile
+
+
+def _existing(*contents: str) -> list[ExistingChunk]:
+    return [
+        ExistingChunk(id=i + 1, content=text, position=i)
+        for i, text in enumerate(contents)
+    ]
+
+
+def test_identical_content_keeps_every_row_untouched():
+    plan = reconcile(_existing("alpha", "beta", "gamma"), ["alpha", "beta", "gamma"])
+
+    assert plan.to_embed == []
+    assert plan.to_delete == []
+    assert plan.reused == []
+
+
+def test_head_insert_embeds_only_the_new_chunk_and_shifts_the_rest():
+    plan = reconcile(_existing("alpha", "beta"), ["intro", "alpha", "beta"])
+
+    assert plan.to_embed == [(0, "intro")]
+    assert plan.to_delete == []
+    # alpha: position 0 -> 1, beta: 1 -> 2; embeddings untouched.
+    assert plan.reused == [(1, 1), (2, 2)]
+
+
+def test_middle_edit_swaps_exactly_one_chunk():
+    plan = reconcile(
+        _existing("alpha", "beta", "gamma"), ["alpha", "beta EDITED", "gamma"]
+    )
+
+    assert plan.to_embed == [(1, "beta EDITED")]
+    assert plan.to_delete == [2]
+    # Neighbours did not move, so no position writes at all.
+    assert plan.reused == []
+
+
+def test_removed_chunk_is_deleted_and_followers_shift_up():
+    plan = reconcile(_existing("alpha", "beta", "gamma"), ["alpha", "gamma"])
+
+    assert plan.to_embed == []
+    assert plan.to_delete == [2]
+    assert plan.reused == [(3, 1)]
+
+
+def test_duplicate_texts_pair_up_one_to_one():
+    # Two identical boilerplate chunks, only one survives the edit: exactly one
+    # row is kept and exactly one is deleted -- never both kept or both dropped.
+    plan = reconcile(_existing("boiler", "boiler", "body"), ["boiler", "body"])
+
+    assert plan.to_embed == []
+    assert plan.to_delete == [2]
+    assert plan.reused == [(3, 1)]
+
+
+def test_duplicate_growth_embeds_only_the_extra_copy():
+    plan = reconcile(_existing("boiler", "body"), ["boiler", "boiler", "body"])
+
+    assert plan.to_embed == [(1, "boiler")]
+    assert plan.to_delete == []
+    assert plan.reused == [(2, 2)]
+
+
+def test_reorder_becomes_position_updates_with_no_embedding():
+    plan = reconcile(_existing("alpha", "beta"), ["beta", "alpha"])
+
+    assert plan.to_embed == []
+    assert plan.to_delete == []
+    assert sorted(plan.reused) == [(1, 1), (2, 0)]
+
+
+def test_full_rewrite_replaces_everything():
+    plan = reconcile(_existing("alpha", "beta"), ["new one", "new two"])
+
+    assert plan.to_embed == [(0, "new one"), (1, "new two")]
+    assert sorted(plan.to_delete) == [1, 2]
+    assert plan.reused == []
+
+
+def test_no_existing_chunks_embeds_all():
+    plan = reconcile([], ["alpha", "beta"])
+
+    assert plan.to_embed == [(0, "alpha"), (1, "beta")]
+    assert plan.to_delete == []
+    assert plan.reused == []