// trustgraph/ts/packages/flow/src/chunking/recursive-splitter.ts

/**
 * Recursive character text splitter.
 *
 * Matches the behaviour of LangChain's RecursiveCharacterTextSplitter:
 * 1. Try separators in order: "\n\n", "\n", " ", ""
 * 2. Split on the best separator that exists in the text
 * 3. Merge small pieces until they approach chunkSize
 * 4. Recursively split pieces that exceed chunkSize with the next separator
 * 5. Apply overlap: include trailing chunkOverlap chars from the previous chunk
 *
 * Python reference: trustgraph-flow/trustgraph/chunking/recursive_splitter/service.py
 */

// Separators tried in priority order, coarsest first: blank line, line break,
// single space. The trailing empty string is the last resort: it always
// "matches" and makes the splitter fall back to individual characters.
const DEFAULT_SEPARATORS = ["\n\n", "\n", " ", ""];

/**
 * Public entry point: split `text` into chunks using the module's default
 * separator list (`DEFAULT_SEPARATORS`).
 *
 * @param text - Text to split.
 * @param chunkSize - Target chunk length (string `.length`) used when merging
 *   pieces; overlap is added afterwards, so returned chunks can be longer.
 * @param chunkOverlap - Amount of trailing text from the preceding chunk to
 *   carry over into each following chunk.
 * @returns The chunks, in document order.
 */
export function recursiveSplit(
  text: string,
  chunkSize: number,
  chunkOverlap: number,
): string[] {
  // All of the work happens in the separator-driven recursive splitter.
  const chunks = splitRecursive(
    text,
    chunkSize,
    chunkOverlap,
    DEFAULT_SEPARATORS,
  );
  return chunks;
}

/**
 * Split `text` on the highest-priority separator it contains, re-split any
 * still-oversized piece with the lower-priority separators, then add overlap
 * between neighbouring chunks.
 *
 * Overlap is applied exactly once, to the final flat chunk list. Applying it
 * inside the recursion would prefix already-overlapped sub-chunks a second
 * time when control returns to the caller.
 *
 * @param text - Text to split.
 * @param chunkSize - Target maximum chunk length (string `.length`, i.e.
 *   UTF-16 code units), measured before overlap is prepended.
 * @param chunkOverlap - Number of trailing characters of the previous chunk
 *   to prepend to each following chunk.
 * @param separators - Candidate separators in priority order; `""` means
 *   "split into individual characters".
 * @returns Chunks in document order; whitespace-only chunks are dropped.
 */
function splitRecursive(
  text: string,
  chunkSize: number,
  chunkOverlap: number,
  separators: string[],
): string[] {
  const chunks = splitWithoutOverlap(text, chunkSize, separators);
  return applyOverlap(chunks, chunkOverlap);
}

/**
 * Recursive core of `splitRecursive`: produces the chunks with no overlap
 * added.
 */
function splitWithoutOverlap(
  text: string,
  chunkSize: number,
  separators: string[],
): string[] {
  // Small enough already: keep it whole unless it is only whitespace.
  if (text.length <= chunkSize) {
    return text.trim().length > 0 ? [text] : [];
  }

  // Pick the first separator that occurs in the text. If none matches we fall
  // back to a character split with nothing left to try afterwards, so the
  // recursion below can never be re-entered with the same separator list.
  let separator = "";
  let remainingSeparators: string[] = [];

  for (let i = 0; i < separators.length; i++) {
    const sep = separators[i];
    if (sep === "" || text.includes(sep)) {
      separator = sep;
      remainingSeparators = separators.slice(i + 1);
      break;
    }
  }

  // Spreading the string splits by code point, so a surrogate pair stays in
  // one piece.
  const pieces = separator === "" ? [...text] : text.split(separator);

  // Greedily glue adjacent pieces back together up to chunkSize.
  const merged = mergePieces(pieces, separator, chunkSize);

  const results: string[] = [];
  for (const chunk of merged) {
    if (chunk.length > chunkSize && remainingSeparators.length > 0) {
      // A single piece was too long on its own: retry with finer separators.
      // Pushed one by one rather than via spread so a very large sub-result
      // cannot hit the engine's argument-count limit.
      for (const sub of splitWithoutOverlap(chunk, chunkSize, remainingSeparators)) {
        results.push(sub);
      }
    } else if (chunk.trim().length > 0) {
      results.push(chunk);
    }
  }

  return results;
}

/**
 * Greedily re-join split pieces into chunks.
 *
 * Pieces are appended to a running buffer (joined with `separator`) for as
 * long as the joined length stays within `chunkSize`. When the next piece
 * would push the buffer over the limit, the buffer is emitted and the piece
 * starts a new one. A piece that is longer than `chunkSize` on its own is
 * emitted unchanged.
 *
 * @param pieces - Fragments produced by splitting on `separator`.
 * @param separator - String re-inserted between pieces inside one chunk.
 * @param chunkSize - Maximum length of a joined chunk.
 * @returns The merged chunks; an empty buffer is never emitted.
 */
function mergePieces(
  pieces: string[],
  separator: string,
  chunkSize: number,
): string[] {
  const out: string[] = [];
  let buffer = "";

  pieces.forEach((part) => {
    // Nothing accumulated yet: the piece becomes the buffer as-is, with no
    // separator in front of it.
    if (buffer.length === 0) {
      buffer = part;
      return;
    }

    const joined = buffer + separator + part;
    if (joined.length > chunkSize) {
      // Adding this piece would overflow: flush and start over with it.
      out.push(buffer);
      buffer = part;
    } else {
      buffer = joined;
    }
  });

  if (buffer.length > 0) {
    out.push(buffer);
  }

  return out;
}

/**
 * Prefix every chunk after the first with the tail of the chunk before it.
 *
 * The tail is taken from the original (un-prefixed) previous chunk, so the
 * overlap does not accumulate along the list.
 *
 * @param chunks - Chunks in document order.
 * @param overlapSize - Maximum number of trailing UTF-16 code units of the
 *   previous chunk to prepend. Values `<= 0` disable overlap.
 * @returns The input array itself when there is nothing to do (no overlap
 *   requested, or fewer than two chunks); otherwise a new array.
 */
function applyOverlap(chunks: string[], overlapSize: number): string[] {
  if (overlapSize <= 0 || chunks.length <= 1) return chunks;

  const result: string[] = [chunks[0]];

  for (let i = 1; i < chunks.length; i++) {
    const prev = chunks[i - 1];
    let start = Math.max(0, prev.length - overlapSize);

    // Never begin the overlap in the middle of a surrogate pair: if the cut
    // lands on a low surrogate whose high half sits just before it, skip the
    // orphaned half so the chunk does not start with a broken character.
    if (start > 0) {
      const here = prev.charCodeAt(start);
      const before = prev.charCodeAt(start - 1);
      const cutsPair =
        here >= 0xdc00 && here <= 0xdfff && before >= 0xd800 && before <= 0xdbff;
      if (cutsPair) {
        start += 1;
      }
    }

    result.push(prev.slice(start) + chunks[i]);
  }

  return result;
}