Use Chunk for recursive splitter results

This commit is contained in:
elpresidank 2026-06-04 07:37:03 -05:00
parent 1a29bdef9d
commit 157dd38df5
4 changed files with 69 additions and 33 deletions

View file

@ -2194,6 +2194,26 @@ Notes:
- `cd ts && bun run lint` - `cd ts && bun run lint`
- `git diff --check` - `git diff --check`
### 2026-06-04: Chunking Chunk Collection Slice
- Status: migrated and package-verified.
- Completed:
- `ts/packages/flow/src/chunking/recursive-splitter.ts` now returns
`Chunk.Chunk<string>` and uses `effect/Chunk` for splitter result, merge,
recursive append, and overlap collections.
- The chunking service behavior is unchanged; `Chunk` is iterable and still
provides `length` for logging and output counting.
- Splitter and service tests convert `Chunk` results to readonly arrays only
at assertion boundaries.
- The focused chunking scan no longer has array-backed splitter result state.
- Verification:
- `cd ts/packages/flow && bunx --bun vitest run src/__tests__/recursive-splitter.test.ts src/__tests__/chunking-service.test.ts`
- `cd ts && bun run check:tsgo`
- `cd ts && bun run build`
- `cd ts && bun run test`
- `cd ts && bun run lint`
- `git diff --check`
## Subagent Findings To Preserve ## Subagent Findings To Preserve
- MCP/workbench: - MCP/workbench:
@ -2342,9 +2362,9 @@ Notes:
broker receive/error payload boundaries remain numeric milliseconds. broker receive/error payload boundaries remain numeric milliseconds.
- CLI modernization remains valid, but the live installed target is - CLI modernization remains valid, but the live installed target is
`effect/unstable/cli` rather than an installed `@effect/cli` package. `effect/unstable/cli` rather than an installed `@effect/cli` package.
- Chunking remains a small valid `effect/Chunk` slice: the recursive
splitter is still array/mutation based and can expose `Chunk.Chunk<string>`
internally while preserving service behavior.
- Chunking `effect/Chunk` migration is complete: the recursive splitter now
returns `Chunk.Chunk<string>` and converts to arrays only at test/assertion
boundaries.
- Knowledge core internals are largely Effect-native, but public core service - Knowledge core internals are largely Effect-native, but public core service
facades still expose Promise methods; migrate tests to Effect-first facades still expose Promise methods; migrate tests to Effect-first
methods before shrinking those facades. methods before shrinking those facades.

View file

@ -1,5 +1,6 @@
import { describe, expect, it } from "@effect/vitest"; import { describe, expect, it } from "@effect/vitest";
import { ConfigProvider, Effect, Fiber } from "effect"; import { ConfigProvider, Effect, Fiber } from "effect";
import * as EffectChunk from "effect/Chunk";
import { import {
MessagingRuntimeLive, MessagingRuntimeLive,
PubSub, PubSub,
@ -212,7 +213,7 @@ describe("ChunkingService", () => {
inputConsumer.push(createMessage(document, { id: "request-1" })); inputConsumer.push(createMessage(document, { id: "request-1" }));
const outputProducer = backend.producersByTopic.get("chunk-output-topic") as RecordingProducer<Chunk>; const outputProducer = backend.producersByTopic.get("chunk-output-topic") as RecordingProducer<Chunk>;
const expectedChunks = recursiveSplit(document.text, 18, 0); const expectedChunks = EffectChunk.toReadonlyArray(recursiveSplit(document.text, 18, 0));
yield* waitFor(() => outputProducer.sent.length === expectedChunks.length, "chunk outputs"); yield* waitFor(() => outputProducer.sent.length === expectedChunks.length, "chunk outputs");
expect(inputConsumer.acknowledged.length).toBe(1); expect(inputConsumer.acknowledged.length).toBe(1);

View file

@ -1,26 +1,34 @@
import { describe, it, expect } from "vitest"; import { describe, it, expect } from "vitest";
import * as EffectChunk from "effect/Chunk";
import { recursiveSplit } from "../chunking/recursive-splitter.js"; import { recursiveSplit } from "../chunking/recursive-splitter.js";
const splitToArray = (
text: string,
chunkSize: number,
chunkOverlap: number,
): ReadonlyArray<string> =>
EffectChunk.toReadonlyArray(recursiveSplit(text, chunkSize, chunkOverlap));
describe("recursiveSplit", () => { describe("recursiveSplit", () => {
// ── Short text returns single chunk ────────────────────────────── // ── Short text returns single chunk ──────────────────────────────
it("returns single chunk when text is shorter than chunkSize", () => { it("returns single chunk when text is shorter than chunkSize", () => {
const result = recursiveSplit("Hello world", 100, 10); const result = splitToArray("Hello world", 100, 10);
expect(result).toEqual(["Hello world"]); expect(result).toEqual(["Hello world"]);
}); });
// ── Empty/whitespace text returns empty array ──────────────────── // ── Empty/whitespace text returns empty array ────────────────────
it("returns empty array for empty string", () => { it("returns empty array for empty string", () => {
expect(recursiveSplit("", 100, 10)).toEqual([]); expect(splitToArray("", 100, 10)).toEqual([]);
}); });
it("returns empty array for whitespace-only text", () => { it("returns empty array for whitespace-only text", () => {
expect(recursiveSplit(" \n\n \n ", 100, 10)).toEqual([]); expect(splitToArray(" \n\n \n ", 100, 10)).toEqual([]);
}); });
// ── Splits on paragraph boundary (\n\n) first ─────────────────── // ── Splits on paragraph boundary (\n\n) first ───────────────────
it("splits on paragraph boundary (\\n\\n) first", () => { it("splits on paragraph boundary (\\n\\n) first", () => {
const text = "Paragraph one content here.\n\nParagraph two content here."; const text = "Paragraph one content here.\n\nParagraph two content here.";
const result = recursiveSplit(text, 30, 0); const result = splitToArray(text, 30, 0);
expect(result.length).toBeGreaterThanOrEqual(2); expect(result.length).toBeGreaterThanOrEqual(2);
// Each chunk should contain content from its respective paragraph // Each chunk should contain content from its respective paragraph
expect(result[0]).toContain("Paragraph one"); expect(result[0]).toContain("Paragraph one");
@ -30,7 +38,7 @@ describe("recursiveSplit", () => {
// ── Splits on \n when no \n\n present ──────────────────────────── // ── Splits on \n when no \n\n present ────────────────────────────
it("splits on newline when no paragraph boundary present", () => { it("splits on newline when no paragraph boundary present", () => {
const text = "Line one content.\nLine two content.\nLine three content."; const text = "Line one content.\nLine two content.\nLine three content.";
const result = recursiveSplit(text, 25, 0); const result = splitToArray(text, 25, 0);
expect(result.length).toBeGreaterThanOrEqual(2); expect(result.length).toBeGreaterThanOrEqual(2);
expect(result[0]).toContain("Line one"); expect(result[0]).toContain("Line one");
}); });
@ -38,7 +46,7 @@ describe("recursiveSplit", () => {
// ── Splits on spaces when no newlines present ──────────────────── // ── Splits on spaces when no newlines present ────────────────────
it("splits on spaces when no newlines present", () => { it("splits on spaces when no newlines present", () => {
const text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10"; const text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10";
const result = recursiveSplit(text, 20, 0); const result = splitToArray(text, 20, 0);
expect(result.length).toBeGreaterThanOrEqual(2); expect(result.length).toBeGreaterThanOrEqual(2);
// Each chunk should be at most roughly chunkSize // Each chunk should be at most roughly chunkSize
for (const chunk of result) { for (const chunk of result) {
@ -51,7 +59,7 @@ describe("recursiveSplit", () => {
it("splits at character level as last resort", () => { it("splits at character level as last resort", () => {
// A single long word with no separators // A single long word with no separators
const text = "abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz"; const text = "abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz";
const result = recursiveSplit(text, 10, 0); const result = splitToArray(text, 10, 0);
expect(result.length).toBeGreaterThanOrEqual(2); expect(result.length).toBeGreaterThanOrEqual(2);
// Reassembled text should match original // Reassembled text should match original
expect(result.join("")).toBe(text); expect(result.join("")).toBe(text);
@ -60,7 +68,7 @@ describe("recursiveSplit", () => {
// ── Overlap: second chunk starts with tail of first ────────────── // ── Overlap: second chunk starts with tail of first ──────────────
it("applies overlap so second chunk starts with tail of first", () => { it("applies overlap so second chunk starts with tail of first", () => {
const text = "First paragraph here.\n\nSecond paragraph here."; const text = "First paragraph here.\n\nSecond paragraph here.";
const result = recursiveSplit(text, 25, 5); const result = splitToArray(text, 25, 5);
expect(result.length).toBeGreaterThanOrEqual(2); expect(result.length).toBeGreaterThanOrEqual(2);
if (result.length >= 2) { if (result.length >= 2) {
// The second chunk should start with the last 5 chars of the first // The second chunk should start with the last 5 chars of the first
@ -77,14 +85,14 @@ describe("recursiveSplit", () => {
(_, i) => `This is paragraph number ${i + 1} with some filler content to make it longer.`, (_, i) => `This is paragraph number ${i + 1} with some filler content to make it longer.`,
); );
const text = paragraphs.join("\n\n"); const text = paragraphs.join("\n\n");
const result = recursiveSplit(text, 100, 10); const result = splitToArray(text, 100, 10);
expect(result.length).toBeGreaterThan(5); expect(result.length).toBeGreaterThan(5);
}); });
// ── chunkOverlap=0 produces no overlap ─────────────────────────── // ── chunkOverlap=0 produces no overlap ───────────────────────────
it("chunkOverlap=0 produces no overlap between chunks", () => { it("chunkOverlap=0 produces no overlap between chunks", () => {
const text = "AAAA\n\nBBBB\n\nCCCC\n\nDDDD"; const text = "AAAA\n\nBBBB\n\nCCCC\n\nDDDD";
const result = recursiveSplit(text, 8, 0); const result = splitToArray(text, 8, 0);
expect(result.length).toBeGreaterThanOrEqual(2); expect(result.length).toBeGreaterThanOrEqual(2);
// With zero overlap, no chunk (except possibly the first) should start with previous chunk's tail // With zero overlap, no chunk (except possibly the first) should start with previous chunk's tail
for (let i = 1; i < result.length; i++) { for (let i = 1; i < result.length; i++) {

View file

@ -11,13 +11,15 @@
* Python reference: trustgraph-flow/trustgraph/chunking/recursive_splitter/service.py * Python reference: trustgraph-flow/trustgraph/chunking/recursive_splitter/service.py
*/ */
const DEFAULT_SEPARATORS = ["\n\n", "\n", " ", ""]; import * as Chunk from "effect/Chunk";
const DEFAULT_SEPARATORS: ReadonlyArray<string> = ["\n\n", "\n", " ", ""];
export function recursiveSplit( export function recursiveSplit(
text: string, text: string,
chunkSize: number, chunkSize: number,
chunkOverlap: number, chunkOverlap: number,
): string[] { ): Chunk.Chunk<string> {
return splitRecursive(text, chunkSize, chunkOverlap, DEFAULT_SEPARATORS); return splitRecursive(text, chunkSize, chunkOverlap, DEFAULT_SEPARATORS);
} }
@ -25,10 +27,10 @@ function splitRecursive(
text: string, text: string,
chunkSize: number, chunkSize: number,
chunkOverlap: number, chunkOverlap: number,
separators: string[], separators: ReadonlyArray<string>,
): string[] { ): Chunk.Chunk<string> {
if (text.length <= chunkSize) { if (text.length <= chunkSize) {
return text.trim().length > 0 ? [text] : []; return text.trim().length > 0 ? Chunk.of(text) : Chunk.empty();
} }
// Find the best separator that exists in the text // Find the best separator that exists in the text
@ -51,13 +53,13 @@ function splitRecursive(
const merged = mergePieces(pieces, separator, chunkSize); const merged = mergePieces(pieces, separator, chunkSize);
// Recursively split oversized chunks with the next separator // Recursively split oversized chunks with the next separator
const results: string[] = []; let results = Chunk.empty<string>();
for (const chunk of merged) { for (const chunk of merged) {
if (chunk.length > chunkSize && remainingSeparators.length > 0) { if (chunk.length > chunkSize && remainingSeparators.length > 0) {
const subChunks = splitRecursive(chunk, chunkSize, chunkOverlap, remainingSeparators); const subChunks = splitRecursive(chunk, chunkSize, chunkOverlap, remainingSeparators);
results.push(...subChunks); results = Chunk.appendAll(results, subChunks);
} else if (chunk.trim().length > 0) { } else if (chunk.trim().length > 0) {
results.push(chunk); results = Chunk.append(results, chunk);
} }
} }
@ -66,18 +68,18 @@ function splitRecursive(
} }
function mergePieces( function mergePieces(
pieces: string[], pieces: ReadonlyArray<string>,
separator: string, separator: string,
chunkSize: number, chunkSize: number,
): string[] { ): Chunk.Chunk<string> {
const chunks: string[] = []; let chunks = Chunk.empty<string>();
let current = ""; let current = "";
for (const piece of pieces) { for (const piece of pieces) {
const candidate = current.length > 0 ? current + separator + piece : piece; const candidate = current.length > 0 ? current + separator + piece : piece;
if (candidate.length > chunkSize && current.length > 0) { if (candidate.length > chunkSize && current.length > 0) {
chunks.push(current); chunks = Chunk.append(chunks, current);
current = piece; current = piece;
} else { } else {
current = candidate; current = candidate;
@ -85,21 +87,26 @@ function mergePieces(
} }
if (current.length > 0) { if (current.length > 0) {
chunks.push(current); chunks = Chunk.append(chunks, current);
} }
return chunks; return chunks;
} }
function applyOverlap(chunks: string[], overlapSize: number): string[] { function applyOverlap(chunks: Chunk.Chunk<string>, overlapSize: number): Chunk.Chunk<string> {
if (overlapSize <= 0 || chunks.length <= 1) return chunks; if (overlapSize <= 0 || chunks.length <= 1) return chunks;
const result: string[] = [chunks[0]]; let result = Chunk.empty<string>();
let previous: string | undefined;
for (let i = 1; i < chunks.length; i++) { for (const chunk of chunks) {
const prev = chunks[i - 1]; if (previous === undefined) {
const overlapText = prev.slice(Math.max(0, prev.length - overlapSize)); result = Chunk.append(result, chunk);
result.push(overlapText + chunks[i]); } else {
const overlapText = previous.slice(Math.max(0, previous.length - overlapSize));
result = Chunk.append(result, overlapText + chunk);
}
previous = chunk;
} }
return result; return result;