mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-01 01:19:38 +02:00
Use Chunk for recursive splitter results
This commit is contained in:
parent
1a29bdef9d
commit
157dd38df5
4 changed files with 69 additions and 33 deletions
|
|
@ -2194,6 +2194,26 @@ Notes:
|
|||
- `cd ts && bun run lint`
|
||||
- `git diff --check`
|
||||
|
||||
### 2026-06-04: Chunking Chunk Collection Slice
|
||||
|
||||
- Status: migrated and package-verified.
|
||||
- Completed:
|
||||
- `ts/packages/flow/src/chunking/recursive-splitter.ts` now returns
|
||||
`Chunk.Chunk<string>` and uses `effect/Chunk` for splitter result, merge,
|
||||
recursive append, and overlap collections.
|
||||
- The chunking service behavior is unchanged; `Chunk` is iterable and still
|
||||
provides `length` for logging and output counting.
|
||||
- Splitter and service tests convert `Chunk` results to readonly arrays only
|
||||
at assertion boundaries.
|
||||
- The focused chunking scan no longer finds array-backed splitter result state.
|
||||
- Verification:
|
||||
- `cd ts/packages/flow && bunx --bun vitest run src/__tests__/recursive-splitter.test.ts src/__tests__/chunking-service.test.ts`
|
||||
- `cd ts && bun run check:tsgo`
|
||||
- `cd ts && bun run build`
|
||||
- `cd ts && bun run test`
|
||||
- `cd ts && bun run lint`
|
||||
- `git diff --check`
|
||||
|
||||
## Subagent Findings To Preserve
|
||||
|
||||
- MCP/workbench:
|
||||
|
|
@ -2342,9 +2362,9 @@ Notes:
|
|||
broker receive/error payload boundaries remain numeric milliseconds.
|
||||
- CLI modernization remains valid, but the live installed target is
|
||||
`effect/unstable/cli` rather than an installed `@effect/cli` package.
|
||||
- Chunking remains a small valid `effect/Chunk` slice: the recursive
|
||||
splitter is still array/mutation based and can expose `Chunk.Chunk<string>`
|
||||
internally while preserving service behavior.
|
||||
- Chunking `effect/Chunk` migration is complete: the recursive splitter now
|
||||
returns `Chunk.Chunk<string>` and converts to arrays only at test/assertion
|
||||
boundaries.
|
||||
- Knowledge core internals are largely Effect-native, but public core service
|
||||
facades still expose Promise methods; migrate tests to Effect-first
|
||||
methods before shrinking those facades.
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import { describe, expect, it } from "@effect/vitest";
|
||||
import { ConfigProvider, Effect, Fiber } from "effect";
|
||||
import * as EffectChunk from "effect/Chunk";
|
||||
import {
|
||||
MessagingRuntimeLive,
|
||||
PubSub,
|
||||
|
|
@ -212,7 +213,7 @@ describe("ChunkingService", () => {
|
|||
inputConsumer.push(createMessage(document, { id: "request-1" }));
|
||||
|
||||
const outputProducer = backend.producersByTopic.get("chunk-output-topic") as RecordingProducer<Chunk>;
|
||||
const expectedChunks = recursiveSplit(document.text, 18, 0);
|
||||
const expectedChunks = EffectChunk.toReadonlyArray(recursiveSplit(document.text, 18, 0));
|
||||
yield* waitFor(() => outputProducer.sent.length === expectedChunks.length, "chunk outputs");
|
||||
|
||||
expect(inputConsumer.acknowledged.length).toBe(1);
|
||||
|
|
|
|||
|
|
@ -1,26 +1,34 @@
|
|||
import { describe, it, expect } from "vitest";
|
||||
import * as EffectChunk from "effect/Chunk";
|
||||
import { recursiveSplit } from "../chunking/recursive-splitter.js";
|
||||
|
||||
const splitToArray = (
|
||||
text: string,
|
||||
chunkSize: number,
|
||||
chunkOverlap: number,
|
||||
): ReadonlyArray<string> =>
|
||||
EffectChunk.toReadonlyArray(recursiveSplit(text, chunkSize, chunkOverlap));
|
||||
|
||||
describe("recursiveSplit", () => {
|
||||
// ── Short text returns single chunk ──────────────────────────────
|
||||
it("returns single chunk when text is shorter than chunkSize", () => {
|
||||
const result = recursiveSplit("Hello world", 100, 10);
|
||||
const result = splitToArray("Hello world", 100, 10);
|
||||
expect(result).toEqual(["Hello world"]);
|
||||
});
|
||||
|
||||
// ── Empty/whitespace text returns empty array ────────────────────
|
||||
it("returns empty array for empty string", () => {
|
||||
expect(recursiveSplit("", 100, 10)).toEqual([]);
|
||||
expect(splitToArray("", 100, 10)).toEqual([]);
|
||||
});
|
||||
|
||||
it("returns empty array for whitespace-only text", () => {
|
||||
expect(recursiveSplit(" \n\n \n ", 100, 10)).toEqual([]);
|
||||
expect(splitToArray(" \n\n \n ", 100, 10)).toEqual([]);
|
||||
});
|
||||
|
||||
// ── Splits on paragraph boundary (\n\n) first ───────────────────
|
||||
it("splits on paragraph boundary (\\n\\n) first", () => {
|
||||
const text = "Paragraph one content here.\n\nParagraph two content here.";
|
||||
const result = recursiveSplit(text, 30, 0);
|
||||
const result = splitToArray(text, 30, 0);
|
||||
expect(result.length).toBeGreaterThanOrEqual(2);
|
||||
// Each chunk should contain content from its respective paragraph
|
||||
expect(result[0]).toContain("Paragraph one");
|
||||
|
|
@ -30,7 +38,7 @@ describe("recursiveSplit", () => {
|
|||
// ── Splits on \n when no \n\n present ────────────────────────────
|
||||
it("splits on newline when no paragraph boundary present", () => {
|
||||
const text = "Line one content.\nLine two content.\nLine three content.";
|
||||
const result = recursiveSplit(text, 25, 0);
|
||||
const result = splitToArray(text, 25, 0);
|
||||
expect(result.length).toBeGreaterThanOrEqual(2);
|
||||
expect(result[0]).toContain("Line one");
|
||||
});
|
||||
|
|
@ -38,7 +46,7 @@ describe("recursiveSplit", () => {
|
|||
// ── Splits on spaces when no newlines present ────────────────────
|
||||
it("splits on spaces when no newlines present", () => {
|
||||
const text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10";
|
||||
const result = recursiveSplit(text, 20, 0);
|
||||
const result = splitToArray(text, 20, 0);
|
||||
expect(result.length).toBeGreaterThanOrEqual(2);
|
||||
// Each chunk should be at most roughly chunkSize
|
||||
for (const chunk of result) {
|
||||
|
|
@ -51,7 +59,7 @@ describe("recursiveSplit", () => {
|
|||
it("splits at character level as last resort", () => {
|
||||
// A single long word with no separators
|
||||
const text = "abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz";
|
||||
const result = recursiveSplit(text, 10, 0);
|
||||
const result = splitToArray(text, 10, 0);
|
||||
expect(result.length).toBeGreaterThanOrEqual(2);
|
||||
// Reassembled text should match original
|
||||
expect(result.join("")).toBe(text);
|
||||
|
|
@ -60,7 +68,7 @@ describe("recursiveSplit", () => {
|
|||
// ── Overlap: second chunk starts with tail of first ──────────────
|
||||
it("applies overlap so second chunk starts with tail of first", () => {
|
||||
const text = "First paragraph here.\n\nSecond paragraph here.";
|
||||
const result = recursiveSplit(text, 25, 5);
|
||||
const result = splitToArray(text, 25, 5);
|
||||
expect(result.length).toBeGreaterThanOrEqual(2);
|
||||
if (result.length >= 2) {
|
||||
// The second chunk should start with the last 5 chars of the first
|
||||
|
|
@ -77,14 +85,14 @@ describe("recursiveSplit", () => {
|
|||
(_, i) => `This is paragraph number ${i + 1} with some filler content to make it longer.`,
|
||||
);
|
||||
const text = paragraphs.join("\n\n");
|
||||
const result = recursiveSplit(text, 100, 10);
|
||||
const result = splitToArray(text, 100, 10);
|
||||
expect(result.length).toBeGreaterThan(5);
|
||||
});
|
||||
|
||||
// ── chunkOverlap=0 produces no overlap ───────────────────────────
|
||||
it("chunkOverlap=0 produces no overlap between chunks", () => {
|
||||
const text = "AAAA\n\nBBBB\n\nCCCC\n\nDDDD";
|
||||
const result = recursiveSplit(text, 8, 0);
|
||||
const result = splitToArray(text, 8, 0);
|
||||
expect(result.length).toBeGreaterThanOrEqual(2);
|
||||
// With zero overlap, no chunk after the first should start with the previous chunk's tail
|
||||
for (let i = 1; i < result.length; i++) {
|
||||
|
|
|
|||
|
|
@ -11,13 +11,15 @@
|
|||
* Python reference: trustgraph-flow/trustgraph/chunking/recursive_splitter/service.py
|
||||
*/
|
||||
|
||||
const DEFAULT_SEPARATORS = ["\n\n", "\n", " ", ""];
|
||||
import * as Chunk from "effect/Chunk";
|
||||
|
||||
const DEFAULT_SEPARATORS: ReadonlyArray<string> = ["\n\n", "\n", " ", ""];
|
||||
|
||||
export function recursiveSplit(
|
||||
text: string,
|
||||
chunkSize: number,
|
||||
chunkOverlap: number,
|
||||
): string[] {
|
||||
): Chunk.Chunk<string> {
|
||||
return splitRecursive(text, chunkSize, chunkOverlap, DEFAULT_SEPARATORS);
|
||||
}
|
||||
|
||||
|
|
@ -25,10 +27,10 @@ function splitRecursive(
|
|||
text: string,
|
||||
chunkSize: number,
|
||||
chunkOverlap: number,
|
||||
separators: string[],
|
||||
): string[] {
|
||||
separators: ReadonlyArray<string>,
|
||||
): Chunk.Chunk<string> {
|
||||
if (text.length <= chunkSize) {
|
||||
return text.trim().length > 0 ? [text] : [];
|
||||
return text.trim().length > 0 ? Chunk.of(text) : Chunk.empty();
|
||||
}
|
||||
|
||||
// Find the best separator that exists in the text
|
||||
|
|
@ -51,13 +53,13 @@ function splitRecursive(
|
|||
const merged = mergePieces(pieces, separator, chunkSize);
|
||||
|
||||
// Recursively split oversized chunks with the next separator
|
||||
const results: string[] = [];
|
||||
let results = Chunk.empty<string>();
|
||||
for (const chunk of merged) {
|
||||
if (chunk.length > chunkSize && remainingSeparators.length > 0) {
|
||||
const subChunks = splitRecursive(chunk, chunkSize, chunkOverlap, remainingSeparators);
|
||||
results.push(...subChunks);
|
||||
results = Chunk.appendAll(results, subChunks);
|
||||
} else if (chunk.trim().length > 0) {
|
||||
results.push(chunk);
|
||||
results = Chunk.append(results, chunk);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -66,18 +68,18 @@ function splitRecursive(
|
|||
}
|
||||
|
||||
function mergePieces(
|
||||
pieces: string[],
|
||||
pieces: ReadonlyArray<string>,
|
||||
separator: string,
|
||||
chunkSize: number,
|
||||
): string[] {
|
||||
const chunks: string[] = [];
|
||||
): Chunk.Chunk<string> {
|
||||
let chunks = Chunk.empty<string>();
|
||||
let current = "";
|
||||
|
||||
for (const piece of pieces) {
|
||||
const candidate = current.length > 0 ? current + separator + piece : piece;
|
||||
|
||||
if (candidate.length > chunkSize && current.length > 0) {
|
||||
chunks.push(current);
|
||||
chunks = Chunk.append(chunks, current);
|
||||
current = piece;
|
||||
} else {
|
||||
current = candidate;
|
||||
|
|
@ -85,21 +87,26 @@ function mergePieces(
|
|||
}
|
||||
|
||||
if (current.length > 0) {
|
||||
chunks.push(current);
|
||||
chunks = Chunk.append(chunks, current);
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
function applyOverlap(chunks: string[], overlapSize: number): string[] {
|
||||
function applyOverlap(chunks: Chunk.Chunk<string>, overlapSize: number): Chunk.Chunk<string> {
|
||||
if (overlapSize <= 0 || chunks.length <= 1) return chunks;
|
||||
|
||||
const result: string[] = [chunks[0]];
|
||||
let result = Chunk.empty<string>();
|
||||
let previous: string | undefined;
|
||||
|
||||
for (let i = 1; i < chunks.length; i++) {
|
||||
const prev = chunks[i - 1];
|
||||
const overlapText = prev.slice(Math.max(0, prev.length - overlapSize));
|
||||
result.push(overlapText + chunks[i]);
|
||||
for (const chunk of chunks) {
|
||||
if (previous === undefined) {
|
||||
result = Chunk.append(result, chunk);
|
||||
} else {
|
||||
const overlapText = previous.slice(Math.max(0, previous.length - overlapSize));
|
||||
result = Chunk.append(result, overlapText + chunk);
|
||||
}
|
||||
previous = chunk;
|
||||
}
|
||||
|
||||
return result;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue