trustgraph/ts/packages/flow/src/chunking/recursive-splitter.ts
elpresidank f09ef4de45 feat: add document pipeline, ReAct agent, and knowledge core services
Document Pipeline (Team A):
- LibrarianService: document storage with filesystem backend, metadata
  persistence, child document hierarchy, collection management
- ChunkingService: recursive character text splitter with configurable
  chunk size/overlap, FlowProcessor pattern
- KnowledgeExtractService: combined relationship + definition extraction
  using prompt service and LLM, emits RDF triples and entity contexts
- KnowledgeCoreService: knowledge core CRUD with streaming export and
  flow-based loading

ReAct Agent (Team B):
- StreamingReActParser: state machine for parsing LLM output into
  Thought/Action/ActionInput/FinalAnswer sections
- Three MVP tools: KnowledgeQuery (GraphRAG), DocumentQuery (DocRAG),
  TriplesQuery with RequestResponse clients
- AgentService FlowProcessor with ReAct loop, tool execution, and
  streaming chunk responses (thought/observation/answer)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 00:19:37 -05:00

106 lines
2.8 KiB
TypeScript

/**
* Recursive character text splitter.
*
* Matches the behaviour of LangChain's RecursiveCharacterTextSplitter:
* 1. Try separators in order: "\n\n", "\n", " ", ""
* 2. Split on the best separator that exists in the text
* 3. Merge small pieces until they approach chunkSize
* 4. Recursively split pieces that exceed chunkSize with the next separator
* 5. Apply overlap: include trailing chunkOverlap chars from the previous chunk
*
* Python reference: trustgraph-flow/trustgraph/chunking/recursive_splitter/service.py
*/
// Separators tried in priority order, from coarsest to finest: blank line,
// line break, space. The trailing "" always matches and makes the splitter
// fall back to splitting the text into individual characters (code points).
const DEFAULT_SEPARATORS = ["\n\n", "\n", " ", ""];
/**
 * Public entry point of the recursive character text splitter.
 *
 * Runs `splitRecursive` over `text` with the module's default separator
 * list (`DEFAULT_SEPARATORS`).
 *
 * @param text - The text to split into chunks.
 * @param chunkSize - Length (`string.length`) above which text is split
 *   further; forwarded unchanged to `splitRecursive`.
 * @param chunkOverlap - Amount of trailing text from the preceding chunk to
 *   repeat at the start of the next one; forwarded unchanged to
 *   `splitRecursive`.
 * @returns The resulting chunks, in document order.
 */
export function recursiveSplit(
  text: string,
  chunkSize: number,
  chunkOverlap: number,
): string[] {
  const chunks = splitRecursive(
    text,
    chunkSize,
    chunkOverlap,
    DEFAULT_SEPARATORS,
  );
  return chunks;
}
/**
 * Splits `text` into chunks and then adds overlap between neighbouring chunks.
 *
 * The splitting itself is done by `splitWithoutOverlap`. Overlap is applied
 * exactly once, to the final flat list of chunks: applying it inside every
 * recursion level would prefix chunks that came out of a nested split a
 * second time when the enclosing level processes them, duplicating text.
 *
 * @param text - The text to split.
 * @param chunkSize - Length (`string.length`) above which a piece of text is
 *   split further. Overlap is added afterwards, so returned chunks can be
 *   longer than this.
 * @param chunkOverlap - Passed to `applyOverlap` together with the final
 *   chunk list.
 * @param separators - Separators to try, in priority order.
 * @returns The chunks in document order; whitespace-only chunks are dropped.
 */
function splitRecursive(
  text: string,
  chunkSize: number,
  chunkOverlap: number,
  separators: string[],
): string[] {
  const chunks = splitWithoutOverlap(text, chunkSize, separators);
  return applyOverlap(chunks, chunkOverlap);
}

/**
 * Recursive core of the splitter: produces chunks without any overlap.
 *
 * @param text - The text (or an oversized chunk of it) to split.
 * @param chunkSize - Length above which a chunk is split with a finer separator.
 * @param separators - Separators still available, in priority order.
 * @returns Non-overlapping chunks; whitespace-only chunks are dropped.
 */
function splitWithoutOverlap(
  text: string,
  chunkSize: number,
  separators: string[],
): string[] {
  // Already small enough: keep it unless it is nothing but whitespace.
  if (text.length <= chunkSize) {
    return text.trim().length > 0 ? [text] : [];
  }

  // Pick the first separator that occurs in the text ("" always matches and
  // means "split per character"). If nothing matches we also fall back to a
  // per-character split, and in that case no separators remain, so the
  // recursion below can never be re-entered with the same separator list.
  let separator = "";
  let remainingSeparators: string[] = [];
  for (const [index, candidate] of separators.entries()) {
    if (candidate === "" || text.includes(candidate)) {
      separator = candidate;
      remainingSeparators = separators.slice(index + 1);
      break;
    }
  }

  // Spreading the string splits by code point, so surrogate pairs stay intact.
  const pieces = separator === "" ? [...text] : text.split(separator);

  // Glue small neighbouring pieces back together up to chunkSize.
  const merged = mergePieces(pieces, separator, chunkSize);

  const results: string[] = [];
  for (const chunk of merged) {
    if (chunk.length > chunkSize && remainingSeparators.length > 0) {
      // A single piece was too large on its own: retry with finer separators.
      results.push(
        ...splitWithoutOverlap(chunk, chunkSize, remainingSeparators),
      );
    } else if (chunk.trim().length > 0) {
      results.push(chunk);
    }
  }
  return results;
}
/**
 * Greedily joins consecutive pieces with `separator` into chunks.
 *
 * A piece is appended to the chunk being built as long as the joined text
 * does not exceed `chunkSize`; otherwise the chunk is emitted and the piece
 * starts a new one. A piece that is longer than `chunkSize` by itself is
 * emitted unchanged as its own chunk.
 *
 * @param pieces - Fragments produced by splitting on `separator`.
 * @param separator - String re-inserted between pieces joined into one chunk.
 * @param chunkSize - Maximum length (`string.length`) of a joined chunk.
 * @returns The merged chunks, in order; empty chunks are never emitted.
 */
function mergePieces(
  pieces: string[],
  separator: string,
  chunkSize: number,
): string[] {
  const merged: string[] = [];
  let buffer = "";

  pieces.forEach((fragment) => {
    // Nothing buffered yet: the fragment opens a new chunk, whatever its size.
    if (buffer.length === 0) {
      buffer = fragment;
      return;
    }

    const joined = `${buffer}${separator}${fragment}`;
    if (joined.length > chunkSize) {
      // Adding the fragment would overflow: close the chunk, start the next.
      merged.push(buffer);
      buffer = fragment;
    } else {
      buffer = joined;
    }
  });

  if (buffer.length !== 0) {
    merged.push(buffer);
  }
  return merged;
}
/**
 * Prefixes every chunk except the first with the tail of the chunk before it.
 *
 * The tail is taken from the original (un-prefixed) previous chunk, so
 * overlap does not accumulate from chunk to chunk.
 *
 * @param chunks - Chunks in document order. The array is not mutated.
 * @param overlapSize - Maximum number of UTF-16 code units copied from the
 *   end of the previous chunk. Values `<= 0` disable overlap.
 * @returns `chunks` itself when there is nothing to do (no overlap requested
 *   or fewer than two chunks); otherwise a new array of prefixed chunks.
 */
function applyOverlap(chunks: string[], overlapSize: number): string[] {
  if (overlapSize <= 0 || chunks.length <= 1) return chunks;

  const result: string[] = [];
  let previous: string | undefined;

  for (const chunk of chunks) {
    if (previous === undefined) {
      // The first chunk has no predecessor to borrow from.
      result.push(chunk);
    } else {
      let start = Math.max(0, previous.length - overlapSize);

      // If the cut lands between the two halves of a surrogate pair, skip the
      // orphaned low surrogate so the overlap never starts with a broken
      // character. The overlap then is one code unit shorter than requested.
      if (start > 0 && start < previous.length) {
        const atCut = previous.charCodeAt(start);
        const beforeCut = previous.charCodeAt(start - 1);
        const cutIsLowSurrogate = atCut >= 0xdc00 && atCut <= 0xdfff;
        const precededByHighSurrogate =
          beforeCut >= 0xd800 && beforeCut <= 0xdbff;
        if (cutIsLowSurrogate && precededByHighSurrogate) {
          start += 1;
        }
      }

      result.push(previous.slice(start) + chunk);
    }
    previous = chunk;
  }

  return result;
}