diff --git a/ts/EFFECT_NATIVE_REWRITE_AUDIT.md b/ts/EFFECT_NATIVE_REWRITE_AUDIT.md index 26b64c24..abdd30fd 100644 --- a/ts/EFFECT_NATIVE_REWRITE_AUDIT.md +++ b/ts/EFFECT_NATIVE_REWRITE_AUDIT.md @@ -2194,6 +2194,26 @@ Notes: - `cd ts && bun run lint` - `git diff --check` +### 2026-06-04: Chunking Chunk Collection Slice + +- Status: migrated and package-verified. +- Completed: + - `ts/packages/flow/src/chunking/recursive-splitter.ts` now returns + `Chunk.Chunk` and uses `effect/Chunk` for splitter result, merge, + recursive append, and overlap collections. + - The chunking service behavior is unchanged; `Chunk` is iterable and still + provides `length` for logging and output counting. + - Splitter and service tests convert `Chunk` results to readonly arrays only + at assertion boundaries. + - The focused chunking scan no longer has array-backed splitter result state. +- Verification: + - `cd ts/packages/flow && bunx --bun vitest run src/__tests__/recursive-splitter.test.ts src/__tests__/chunking-service.test.ts` + - `cd ts && bun run check:tsgo` + - `cd ts && bun run build` + - `cd ts && bun run test` + - `cd ts && bun run lint` + - `git diff --check` + ## Subagent Findings To Preserve - MCP/workbench: @@ -2342,9 +2362,9 @@ Notes: broker receive/error payload boundaries remain numeric milliseconds. - CLI modernization remains valid, but the live installed target is `effect/unstable/cli` rather than an installed `@effect/cli` package. - - Chunking remains a small valid `effect/Chunk` slice: the recursive - splitter is still array/mutation based and can expose `Chunk.Chunk` - internally while preserving service behavior. + - Chunking `effect/Chunk` migration is complete: the recursive splitter now + returns `Chunk.Chunk` and converts to arrays only at test/assertion + boundaries. - Knowledge core internals are largely Effect-native, but public core service facades still expose Promise methods; migrate tests to Effect-first methods before shrinking those facades. 
diff --git a/ts/packages/flow/src/__tests__/chunking-service.test.ts b/ts/packages/flow/src/__tests__/chunking-service.test.ts index 473679f3..aed6121e 100644 --- a/ts/packages/flow/src/__tests__/chunking-service.test.ts +++ b/ts/packages/flow/src/__tests__/chunking-service.test.ts @@ -1,5 +1,6 @@ import { describe, expect, it } from "@effect/vitest"; import { ConfigProvider, Effect, Fiber } from "effect"; +import * as EffectChunk from "effect/Chunk"; import { MessagingRuntimeLive, PubSub, @@ -212,7 +213,7 @@ describe("ChunkingService", () => { inputConsumer.push(createMessage(document, { id: "request-1" })); const outputProducer = backend.producersByTopic.get("chunk-output-topic") as RecordingProducer; - const expectedChunks = recursiveSplit(document.text, 18, 0); + const expectedChunks = EffectChunk.toReadonlyArray(recursiveSplit(document.text, 18, 0)); yield* waitFor(() => outputProducer.sent.length === expectedChunks.length, "chunk outputs"); expect(inputConsumer.acknowledged.length).toBe(1); diff --git a/ts/packages/flow/src/__tests__/recursive-splitter.test.ts b/ts/packages/flow/src/__tests__/recursive-splitter.test.ts index 7a36c0fc..8a326e25 100644 --- a/ts/packages/flow/src/__tests__/recursive-splitter.test.ts +++ b/ts/packages/flow/src/__tests__/recursive-splitter.test.ts @@ -1,26 +1,34 @@ import { describe, it, expect } from "vitest"; +import * as EffectChunk from "effect/Chunk"; import { recursiveSplit } from "../chunking/recursive-splitter.js"; +const splitToArray = ( + text: string, + chunkSize: number, + chunkOverlap: number, +): ReadonlyArray<string> => + EffectChunk.toReadonlyArray(recursiveSplit(text, chunkSize, chunkOverlap)); + describe("recursiveSplit", () => { // ── Short text returns single chunk ────────────────────────────── it("returns single chunk when text is shorter than chunkSize", () => { - const result = recursiveSplit("Hello world", 100, 10); + const result = splitToArray("Hello world", 100, 10); expect(result).toEqual(["Hello world"]); }); 
// ── Empty/whitespace text returns empty array ──────────────────── it("returns empty array for empty string", () => { - expect(recursiveSplit("", 100, 10)).toEqual([]); + expect(splitToArray("", 100, 10)).toEqual([]); }); it("returns empty array for whitespace-only text", () => { - expect(recursiveSplit(" \n\n \n ", 100, 10)).toEqual([]); + expect(splitToArray(" \n\n \n ", 100, 10)).toEqual([]); }); // ── Splits on paragraph boundary (\n\n) first ─────────────────── it("splits on paragraph boundary (\\n\\n) first", () => { const text = "Paragraph one content here.\n\nParagraph two content here."; - const result = recursiveSplit(text, 30, 0); + const result = splitToArray(text, 30, 0); expect(result.length).toBeGreaterThanOrEqual(2); // Each chunk should contain content from its respective paragraph expect(result[0]).toContain("Paragraph one"); @@ -30,7 +38,7 @@ describe("recursiveSplit", () => { // ── Splits on \n when no \n\n present ──────────────────────────── it("splits on newline when no paragraph boundary present", () => { const text = "Line one content.\nLine two content.\nLine three content."; - const result = recursiveSplit(text, 25, 0); + const result = splitToArray(text, 25, 0); expect(result.length).toBeGreaterThanOrEqual(2); expect(result[0]).toContain("Line one"); }); @@ -38,7 +46,7 @@ describe("recursiveSplit", () => { // ── Splits on spaces when no newlines present ──────────────────── it("splits on spaces when no newlines present", () => { const text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10"; - const result = recursiveSplit(text, 20, 0); + const result = splitToArray(text, 20, 0); expect(result.length).toBeGreaterThanOrEqual(2); // Each chunk should be at most roughly chunkSize for (const chunk of result) { @@ -51,7 +59,7 @@ describe("recursiveSplit", () => { it("splits at character level as last resort", () => { // A single long word with no separators const text = 
"abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz"; - const result = recursiveSplit(text, 10, 0); + const result = splitToArray(text, 10, 0); expect(result.length).toBeGreaterThanOrEqual(2); // Reassembled text should match original expect(result.join("")).toBe(text); @@ -60,7 +68,7 @@ describe("recursiveSplit", () => { // ── Overlap: second chunk starts with tail of first ────────────── it("applies overlap so second chunk starts with tail of first", () => { const text = "First paragraph here.\n\nSecond paragraph here."; - const result = recursiveSplit(text, 25, 5); + const result = splitToArray(text, 25, 5); expect(result.length).toBeGreaterThanOrEqual(2); if (result.length >= 2) { // The second chunk should start with the last 5 chars of the first @@ -77,14 +85,14 @@ describe("recursiveSplit", () => { (_, i) => `This is paragraph number ${i + 1} with some filler content to make it longer.`, ); const text = paragraphs.join("\n\n"); - const result = recursiveSplit(text, 100, 10); + const result = splitToArray(text, 100, 10); expect(result.length).toBeGreaterThan(5); }); // ── chunkOverlap=0 produces no overlap ─────────────────────────── it("chunkOverlap=0 produces no overlap between chunks", () => { const text = "AAAA\n\nBBBB\n\nCCCC\n\nDDDD"; - const result = recursiveSplit(text, 8, 0); + const result = splitToArray(text, 8, 0); expect(result.length).toBeGreaterThanOrEqual(2); // With zero overlap, no chunk (except possibly the first) should start with previous chunk's tail for (let i = 1; i < result.length; i++) { diff --git a/ts/packages/flow/src/chunking/recursive-splitter.ts b/ts/packages/flow/src/chunking/recursive-splitter.ts index c0a997a1..bd2d9857 100644 --- a/ts/packages/flow/src/chunking/recursive-splitter.ts +++ b/ts/packages/flow/src/chunking/recursive-splitter.ts @@ -11,13 +11,15 @@ * Python reference: trustgraph-flow/trustgraph/chunking/recursive_splitter/service.py */ -const DEFAULT_SEPARATORS = ["\n\n", "\n", " ", ""]; +import * as 
Chunk from "effect/Chunk"; + +const DEFAULT_SEPARATORS: ReadonlyArray<string> = ["\n\n", "\n", " ", ""]; export function recursiveSplit( text: string, chunkSize: number, chunkOverlap: number, -): string[] { +): Chunk.Chunk<string> { return splitRecursive(text, chunkSize, chunkOverlap, DEFAULT_SEPARATORS); } @@ -25,10 +27,10 @@ function splitRecursive( text: string, chunkSize: number, chunkOverlap: number, - separators: string[], -): string[] { + separators: ReadonlyArray<string>, +): Chunk.Chunk<string> { if (text.length <= chunkSize) { - return text.trim().length > 0 ? [text] : []; + return text.trim().length > 0 ? Chunk.of(text) : Chunk.empty(); } // Find the best separator that exists in the text @@ -51,13 +53,13 @@ function splitRecursive( const merged = mergePieces(pieces, separator, chunkSize); // Recursively split oversized chunks with the next separator - const results: string[] = []; + let results = Chunk.empty<string>(); for (const chunk of merged) { if (chunk.length > chunkSize && remainingSeparators.length > 0) { const subChunks = splitRecursive(chunk, chunkSize, chunkOverlap, remainingSeparators); - results.push(...subChunks); + results = Chunk.appendAll(results, subChunks); } else if (chunk.trim().length > 0) { - results.push(chunk); + results = Chunk.append(results, chunk); } } @@ -66,18 +68,18 @@ function splitRecursive( } function mergePieces( - pieces: string[], + pieces: ReadonlyArray<string>, separator: string, chunkSize: number, -): string[] { - const chunks: string[] = []; +): Chunk.Chunk<string> { + let chunks = Chunk.empty<string>(); let current = ""; for (const piece of pieces) { const candidate = current.length > 0 ? 
current + separator + piece : piece; if (candidate.length > chunkSize && current.length > 0) { - chunks.push(current); + chunks = Chunk.append(chunks, current); current = piece; } else { current = candidate; @@ -85,21 +87,26 @@ function mergePieces( } if (current.length > 0) { - chunks.push(current); + chunks = Chunk.append(chunks, current); } return chunks; } -function applyOverlap(chunks: string[], overlapSize: number): string[] { +function applyOverlap(chunks: Chunk.Chunk<string>, overlapSize: number): Chunk.Chunk<string> { if (overlapSize <= 0 || chunks.length <= 1) return chunks; - const result: string[] = [chunks[0]]; + let result = Chunk.empty<string>(); + let previous: string | undefined; - for (let i = 1; i < chunks.length; i++) { - const prev = chunks[i - 1]; - const overlapText = prev.slice(Math.max(0, prev.length - overlapSize)); - result.push(overlapText + chunks[i]); + for (const chunk of chunks) { + if (previous === undefined) { + result = Chunk.append(result, chunk); + } else { + const overlapText = previous.slice(Math.max(0, previous.length - overlapSize)); + result = Chunk.append(result, overlapText + chunk); + } + previous = chunk; } return result;