Use Chunk for recursive splitter results

2026-07-01 01:19:38 +02:00 · 2026-06-04 07:37:03 -05:00 · 2026-06-04 07:37:03 -05:00 · 157dd38df5
commit 157dd38df5
parent 1a29bdef9d
4 changed files with 69 additions and 33 deletions
--- a/ts/EFFECT_NATIVE_REWRITE_AUDIT.md
+++ b/ts/EFFECT_NATIVE_REWRITE_AUDIT.md
@@ -2194,6 +2194,26 @@ Notes:
  - `cd ts && bun run lint`
  - `git diff --check`

+### 2026-06-04: Chunking Chunk Collection Slice
+
+- Status: migrated and package-verified.
+- Completed:
+  - `ts/packages/flow/src/chunking/recursive-splitter.ts` now returns
+    `Chunk.Chunk<string>` and uses `effect/Chunk` for splitter result, merge,
+    recursive append, and overlap collections.
+  - The chunking service behavior is unchanged; `Chunk` is iterable and still
+    provides `length` for logging and output counting.
+  - Splitter and service tests convert `Chunk` results to readonly arrays only
+    at assertion boundaries.
+  - The focused chunking scan no longer has array-backed splitter result state.
+- Verification:
+  - `cd ts/packages/flow && bunx --bun vitest run src/__tests__/recursive-splitter.test.ts src/__tests__/chunking-service.test.ts`
+  - `cd ts && bun run check:tsgo`
+  - `cd ts && bun run build`
+  - `cd ts && bun run test`
+  - `cd ts && bun run lint`
+  - `git diff --check`
+
 ## Subagent Findings To Preserve

 - MCP/workbench:
@@ -2342,9 +2362,9 @@ Notes:
    broker receive/error payload boundaries remain numeric milliseconds.
  - CLI modernization remains valid, but the live installed target is
    `effect/unstable/cli` rather than an installed `@effect/cli` package.
-  - Chunking remains a small valid `effect/Chunk` slice: the recursive
-    splitter is still array/mutation based and can expose `Chunk.Chunk<string>`
-    internally while preserving service behavior.
+  - Chunking `effect/Chunk` migration is complete: the recursive splitter now
+    returns `Chunk.Chunk<string>`, and tests convert results to readonly
+    arrays only at assertion boundaries.
  - Knowledge core internals are largely Effect-native, but public core service
    facades still expose Promise methods; migrate tests to Effect-first
    methods before shrinking those facades.
--- a/ts/packages/flow/src/__tests__/chunking-service.test.ts
+++ b/ts/packages/flow/src/__tests__/chunking-service.test.ts
@@ -1,5 +1,6 @@
 import { describe, expect, it } from "@effect/vitest";
 import { ConfigProvider, Effect, Fiber } from "effect";
+import * as EffectChunk from "effect/Chunk";
 import {
  MessagingRuntimeLive,
  PubSub,
@@ -212,7 +213,7 @@ describe("ChunkingService", () => {
          inputConsumer.push(createMessage(document, { id: "request-1" }));

          const outputProducer = backend.producersByTopic.get("chunk-output-topic") as RecordingProducer<Chunk>;
-          const expectedChunks = recursiveSplit(document.text, 18, 0);
+          const expectedChunks = EffectChunk.toReadonlyArray(recursiveSplit(document.text, 18, 0));
          yield* waitFor(() => outputProducer.sent.length === expectedChunks.length, "chunk outputs");

          expect(inputConsumer.acknowledged.length).toBe(1);
--- a/ts/packages/flow/src/__tests__/recursive-splitter.test.ts
+++ b/ts/packages/flow/src/__tests__/recursive-splitter.test.ts
@@ -1,26 +1,34 @@
 import { describe, it, expect } from "vitest";
+import * as EffectChunk from "effect/Chunk";
 import { recursiveSplit } from "../chunking/recursive-splitter.js";

+const splitToArray = (
+  text: string,
+  chunkSize: number,
+  chunkOverlap: number,
+): ReadonlyArray<string> =>
+  EffectChunk.toReadonlyArray(recursiveSplit(text, chunkSize, chunkOverlap));
+
 describe("recursiveSplit", () => {
  // ── Short text returns single chunk ──────────────────────────────
  it("returns single chunk when text is shorter than chunkSize", () => {
-    const result = recursiveSplit("Hello world", 100, 10);
+    const result = splitToArray("Hello world", 100, 10);
    expect(result).toEqual(["Hello world"]);
  });

  // ── Empty/whitespace text returns empty array ────────────────────
  it("returns empty array for empty string", () => {
-    expect(recursiveSplit("", 100, 10)).toEqual([]);
+    expect(splitToArray("", 100, 10)).toEqual([]);
  });

  it("returns empty array for whitespace-only text", () => {
-    expect(recursiveSplit("   \n\n  \n  ", 100, 10)).toEqual([]);
+    expect(splitToArray("   \n\n  \n  ", 100, 10)).toEqual([]);
  });

  // ── Splits on paragraph boundary (\n\n) first ───────────────────
  it("splits on paragraph boundary (\\n\\n) first", () => {
    const text = "Paragraph one content here.\n\nParagraph two content here.";
-    const result = recursiveSplit(text, 30, 0);
+    const result = splitToArray(text, 30, 0);
    expect(result.length).toBeGreaterThanOrEqual(2);
    // Each chunk should contain content from its respective paragraph
    expect(result[0]).toContain("Paragraph one");
@@ -30,7 +38,7 @@ describe("recursiveSplit", () => {
  // ── Splits on \n when no \n\n present ────────────────────────────
  it("splits on newline when no paragraph boundary present", () => {
    const text = "Line one content.\nLine two content.\nLine three content.";
-    const result = recursiveSplit(text, 25, 0);
+    const result = splitToArray(text, 25, 0);
    expect(result.length).toBeGreaterThanOrEqual(2);
    expect(result[0]).toContain("Line one");
  });
@@ -38,7 +46,7 @@ describe("recursiveSplit", () => {
  // ── Splits on spaces when no newlines present ────────────────────
  it("splits on spaces when no newlines present", () => {
    const text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10";
-    const result = recursiveSplit(text, 20, 0);
+    const result = splitToArray(text, 20, 0);
    expect(result.length).toBeGreaterThanOrEqual(2);
    // Each chunk should be at most roughly chunkSize
    for (const chunk of result) {
@@ -51,7 +59,7 @@ describe("recursiveSplit", () => {
  it("splits at character level as last resort", () => {
    // A single long word with no separators
    const text = "abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz";
-    const result = recursiveSplit(text, 10, 0);
+    const result = splitToArray(text, 10, 0);
    expect(result.length).toBeGreaterThanOrEqual(2);
    // Reassembled text should match original
    expect(result.join("")).toBe(text);
@@ -60,7 +68,7 @@ describe("recursiveSplit", () => {
  // ── Overlap: second chunk starts with tail of first ──────────────
  it("applies overlap so second chunk starts with tail of first", () => {
    const text = "First paragraph here.\n\nSecond paragraph here.";
-    const result = recursiveSplit(text, 25, 5);
+    const result = splitToArray(text, 25, 5);
    expect(result.length).toBeGreaterThanOrEqual(2);
    if (result.length >= 2) {
      // The second chunk should start with the last 5 chars of the first
@@ -77,14 +85,14 @@ describe("recursiveSplit", () => {
      (_, i) => `This is paragraph number ${i + 1} with some filler content to make it longer.`,
    );
    const text = paragraphs.join("\n\n");
-    const result = recursiveSplit(text, 100, 10);
+    const result = splitToArray(text, 100, 10);
    expect(result.length).toBeGreaterThan(5);
  });

  // ── chunkOverlap=0 produces no overlap ───────────────────────────
  it("chunkOverlap=0 produces no overlap between chunks", () => {
    const text = "AAAA\n\nBBBB\n\nCCCC\n\nDDDD";
-    const result = recursiveSplit(text, 8, 0);
+    const result = splitToArray(text, 8, 0);
    expect(result.length).toBeGreaterThanOrEqual(2);
    // With zero overlap, no chunk (except possibly the first) should start with previous chunk's tail
    for (let i = 1; i < result.length; i++) {
--- a/ts/packages/flow/src/chunking/recursive-splitter.ts
+++ b/ts/packages/flow/src/chunking/recursive-splitter.ts
@@ -11,13 +11,15 @@
 * Python reference: trustgraph-flow/trustgraph/chunking/recursive_splitter/service.py
 */

-const DEFAULT_SEPARATORS = ["\n\n", "\n", " ", ""];
+import * as Chunk from "effect/Chunk";
+
+const DEFAULT_SEPARATORS: ReadonlyArray<string> = ["\n\n", "\n", " ", ""];

 export function recursiveSplit(
  text: string,
  chunkSize: number,
  chunkOverlap: number,
-): string[] {
+): Chunk.Chunk<string> {
  return splitRecursive(text, chunkSize, chunkOverlap, DEFAULT_SEPARATORS);
 }

@@ -25,10 +27,10 @@ function splitRecursive(
  text: string,
  chunkSize: number,
  chunkOverlap: number,
-  separators: string[],
-): string[] {
+  separators: ReadonlyArray<string>,
+): Chunk.Chunk<string> {
  if (text.length <= chunkSize) {
-    return text.trim().length > 0 ? [text] : [];
+    return text.trim().length > 0 ? Chunk.of(text) : Chunk.empty();
  }

  // Find the best separator that exists in the text
@@ -51,13 +53,13 @@ function splitRecursive(
  const merged = mergePieces(pieces, separator, chunkSize);

  // Recursively split oversized chunks with the next separator
-  const results: string[] = [];
+  let results = Chunk.empty<string>();
  for (const chunk of merged) {
    if (chunk.length > chunkSize && remainingSeparators.length > 0) {
      const subChunks = splitRecursive(chunk, chunkSize, chunkOverlap, remainingSeparators);
-      results.push(...subChunks);
+      results = Chunk.appendAll(results, subChunks);
    } else if (chunk.trim().length > 0) {
-      results.push(chunk);
+      results = Chunk.append(results, chunk);
    }
  }

@@ -66,18 +68,18 @@ function splitRecursive(
 }

 function mergePieces(
-  pieces: string[],
+  pieces: ReadonlyArray<string>,
  separator: string,
  chunkSize: number,
-): string[] {
-  const chunks: string[] = [];
+): Chunk.Chunk<string> {
+  let chunks = Chunk.empty<string>();
  let current = "";

  for (const piece of pieces) {
    const candidate = current.length > 0 ? current + separator + piece : piece;

    if (candidate.length > chunkSize && current.length > 0) {
-      chunks.push(current);
+      chunks = Chunk.append(chunks, current);
      current = piece;
    } else {
      current = candidate;
@@ -85,21 +87,26 @@ function mergePieces(
  }

  if (current.length > 0) {
-    chunks.push(current);
+    chunks = Chunk.append(chunks, current);
  }

  return chunks;
 }

-function applyOverlap(chunks: string[], overlapSize: number): string[] {
+function applyOverlap(chunks: Chunk.Chunk<string>, overlapSize: number): Chunk.Chunk<string> {
  if (overlapSize <= 0 || chunks.length <= 1) return chunks;

-  const result: string[] = [chunks[0]];
+  let result = Chunk.empty<string>();
+  let previous: string | undefined;

-  for (let i = 1; i < chunks.length; i++) {
-    const prev = chunks[i - 1];
-    const overlapText = prev.slice(Math.max(0, prev.length - overlapSize));
-    result.push(overlapText + chunks[i]);
+  for (const chunk of chunks) {
+    if (previous === undefined) {
+      result = Chunk.append(result, chunk);
+    } else {
+      const overlapText = previous.slice(Math.max(0, previous.length - overlapSize));
+      result = Chunk.append(result, overlapText + chunk);
+    }
+    previous = chunk;
  }

  return result;