Use Chunk for recursive splitter results

This commit is contained in:
elpresidank 2026-06-04 07:37:03 -05:00
parent 1a29bdef9d
commit 157dd38df5
4 changed files with 69 additions and 33 deletions

View file

@ -2194,6 +2194,26 @@ Notes:
- `cd ts && bun run lint`
- `git diff --check`
### 2026-06-04: Chunking Chunk Collection Slice
- Status: migrated and package-verified.
- Completed:
- `ts/packages/flow/src/chunking/recursive-splitter.ts` now returns
`Chunk.Chunk<string>` and uses `effect/Chunk` for splitter result, merge,
recursive append, and overlap collections.
- The chunking service behavior is unchanged; `Chunk` is iterable and still
provides `length` for logging and output counting.
- Splitter and service tests convert `Chunk` results to readonly arrays only
at assertion boundaries.
- The focused chunking scan no longer has array-backed splitter result state.
- Verification:
- `cd ts/packages/flow && bunx --bun vitest run src/__tests__/recursive-splitter.test.ts src/__tests__/chunking-service.test.ts`
- `cd ts && bun run check:tsgo`
- `cd ts && bun run build`
- `cd ts && bun run test`
- `cd ts && bun run lint`
- `git diff --check`
## Subagent Findings To Preserve
- MCP/workbench:
@ -2342,9 +2362,9 @@ Notes:
broker receive/error payload boundaries remain numeric milliseconds.
- CLI modernization remains valid, but the live installed target is
`effect/unstable/cli` rather than an installed `@effect/cli` package.
- Chunking remains a small valid `effect/Chunk` slice: the recursive
splitter is still array/mutation based and can expose `Chunk.Chunk<string>`
internally while preserving service behavior.
- Chunking `effect/Chunk` migration is complete: the recursive splitter now
returns `Chunk.Chunk<string>`, and results are converted to arrays only at
test/assertion boundaries.
- Knowledge core internals are largely Effect-native, but public core service
facades still expose Promise methods; migrate tests to Effect-first
methods before shrinking those facades.

View file

@ -1,5 +1,6 @@
import { describe, expect, it } from "@effect/vitest";
import { ConfigProvider, Effect, Fiber } from "effect";
import * as EffectChunk from "effect/Chunk";
import {
MessagingRuntimeLive,
PubSub,
@ -212,7 +213,7 @@ describe("ChunkingService", () => {
inputConsumer.push(createMessage(document, { id: "request-1" }));
const outputProducer = backend.producersByTopic.get("chunk-output-topic") as RecordingProducer<Chunk>;
const expectedChunks = recursiveSplit(document.text, 18, 0);
const expectedChunks = EffectChunk.toReadonlyArray(recursiveSplit(document.text, 18, 0));
yield* waitFor(() => outputProducer.sent.length === expectedChunks.length, "chunk outputs");
expect(inputConsumer.acknowledged.length).toBe(1);

View file

@ -1,26 +1,34 @@
import { describe, it, expect } from "vitest";
import * as EffectChunk from "effect/Chunk";
import { recursiveSplit } from "../chunking/recursive-splitter.js";
const splitToArray = (
text: string,
chunkSize: number,
chunkOverlap: number,
): ReadonlyArray<string> =>
EffectChunk.toReadonlyArray(recursiveSplit(text, chunkSize, chunkOverlap));
describe("recursiveSplit", () => {
// ── Short text returns single chunk ──────────────────────────────
it("returns single chunk when text is shorter than chunkSize", () => {
const result = recursiveSplit("Hello world", 100, 10);
const result = splitToArray("Hello world", 100, 10);
expect(result).toEqual(["Hello world"]);
});
// ── Empty/whitespace text returns empty array ────────────────────
it("returns empty array for empty string", () => {
expect(recursiveSplit("", 100, 10)).toEqual([]);
expect(splitToArray("", 100, 10)).toEqual([]);
});
it("returns empty array for whitespace-only text", () => {
expect(recursiveSplit(" \n\n \n ", 100, 10)).toEqual([]);
expect(splitToArray(" \n\n \n ", 100, 10)).toEqual([]);
});
// ── Splits on paragraph boundary (\n\n) first ───────────────────
it("splits on paragraph boundary (\\n\\n) first", () => {
const text = "Paragraph one content here.\n\nParagraph two content here.";
const result = recursiveSplit(text, 30, 0);
const result = splitToArray(text, 30, 0);
expect(result.length).toBeGreaterThanOrEqual(2);
// Each chunk should contain content from its respective paragraph
expect(result[0]).toContain("Paragraph one");
@ -30,7 +38,7 @@ describe("recursiveSplit", () => {
// ── Splits on \n when no \n\n present ────────────────────────────
it("splits on newline when no paragraph boundary present", () => {
const text = "Line one content.\nLine two content.\nLine three content.";
const result = recursiveSplit(text, 25, 0);
const result = splitToArray(text, 25, 0);
expect(result.length).toBeGreaterThanOrEqual(2);
expect(result[0]).toContain("Line one");
});
@ -38,7 +46,7 @@ describe("recursiveSplit", () => {
// ── Splits on spaces when no newlines present ────────────────────
it("splits on spaces when no newlines present", () => {
const text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10";
const result = recursiveSplit(text, 20, 0);
const result = splitToArray(text, 20, 0);
expect(result.length).toBeGreaterThanOrEqual(2);
// Each chunk should be at most roughly chunkSize
for (const chunk of result) {
@ -51,7 +59,7 @@ describe("recursiveSplit", () => {
it("splits at character level as last resort", () => {
// A single long word with no separators
const text = "abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz";
const result = recursiveSplit(text, 10, 0);
const result = splitToArray(text, 10, 0);
expect(result.length).toBeGreaterThanOrEqual(2);
// Reassembled text should match original
expect(result.join("")).toBe(text);
@ -60,7 +68,7 @@ describe("recursiveSplit", () => {
// ── Overlap: second chunk starts with tail of first ──────────────
it("applies overlap so second chunk starts with tail of first", () => {
const text = "First paragraph here.\n\nSecond paragraph here.";
const result = recursiveSplit(text, 25, 5);
const result = splitToArray(text, 25, 5);
expect(result.length).toBeGreaterThanOrEqual(2);
if (result.length >= 2) {
// The second chunk should start with the last 5 chars of the first
@ -77,14 +85,14 @@ describe("recursiveSplit", () => {
(_, i) => `This is paragraph number ${i + 1} with some filler content to make it longer.`,
);
const text = paragraphs.join("\n\n");
const result = recursiveSplit(text, 100, 10);
const result = splitToArray(text, 100, 10);
expect(result.length).toBeGreaterThan(5);
});
// ── chunkOverlap=0 produces no overlap ───────────────────────────
it("chunkOverlap=0 produces no overlap between chunks", () => {
const text = "AAAA\n\nBBBB\n\nCCCC\n\nDDDD";
const result = recursiveSplit(text, 8, 0);
const result = splitToArray(text, 8, 0);
expect(result.length).toBeGreaterThanOrEqual(2);
// With zero overlap, no chunk after the first should start with the previous chunk's tail
for (let i = 1; i < result.length; i++) {

View file

@ -11,13 +11,15 @@
* Python reference: trustgraph-flow/trustgraph/chunking/recursive_splitter/service.py
*/
const DEFAULT_SEPARATORS = ["\n\n", "\n", " ", ""];
import * as Chunk from "effect/Chunk";
const DEFAULT_SEPARATORS: ReadonlyArray<string> = ["\n\n", "\n", " ", ""];
export function recursiveSplit(
text: string,
chunkSize: number,
chunkOverlap: number,
): string[] {
): Chunk.Chunk<string> {
return splitRecursive(text, chunkSize, chunkOverlap, DEFAULT_SEPARATORS);
}
@ -25,10 +27,10 @@ function splitRecursive(
text: string,
chunkSize: number,
chunkOverlap: number,
separators: string[],
): string[] {
separators: ReadonlyArray<string>,
): Chunk.Chunk<string> {
if (text.length <= chunkSize) {
return text.trim().length > 0 ? [text] : [];
return text.trim().length > 0 ? Chunk.of(text) : Chunk.empty();
}
// Find the best separator that exists in the text
@ -51,13 +53,13 @@ function splitRecursive(
const merged = mergePieces(pieces, separator, chunkSize);
// Recursively split oversized chunks with the next separator
const results: string[] = [];
let results = Chunk.empty<string>();
for (const chunk of merged) {
if (chunk.length > chunkSize && remainingSeparators.length > 0) {
const subChunks = splitRecursive(chunk, chunkSize, chunkOverlap, remainingSeparators);
results.push(...subChunks);
results = Chunk.appendAll(results, subChunks);
} else if (chunk.trim().length > 0) {
results.push(chunk);
results = Chunk.append(results, chunk);
}
}
@ -66,18 +68,18 @@ function splitRecursive(
}
function mergePieces(
pieces: string[],
pieces: ReadonlyArray<string>,
separator: string,
chunkSize: number,
): string[] {
const chunks: string[] = [];
): Chunk.Chunk<string> {
let chunks = Chunk.empty<string>();
let current = "";
for (const piece of pieces) {
const candidate = current.length > 0 ? current + separator + piece : piece;
if (candidate.length > chunkSize && current.length > 0) {
chunks.push(current);
chunks = Chunk.append(chunks, current);
current = piece;
} else {
current = candidate;
@ -85,21 +87,26 @@ function mergePieces(
}
if (current.length > 0) {
chunks.push(current);
chunks = Chunk.append(chunks, current);
}
return chunks;
}
function applyOverlap(chunks: string[], overlapSize: number): string[] {
function applyOverlap(chunks: Chunk.Chunk<string>, overlapSize: number): Chunk.Chunk<string> {
if (overlapSize <= 0 || chunks.length <= 1) return chunks;
const result: string[] = [chunks[0]];
let result = Chunk.empty<string>();
let previous: string | undefined;
for (let i = 1; i < chunks.length; i++) {
const prev = chunks[i - 1];
const overlapText = prev.slice(Math.max(0, prev.length - overlapSize));
result.push(overlapText + chunks[i]);
for (const chunk of chunks) {
if (previous === undefined) {
result = Chunk.append(result, chunk);
} else {
const overlapText = previous.slice(Math.max(0, previous.length - overlapSize));
result = Chunk.append(result, overlapText + chunk);
}
previous = chunk;
}
return result;