mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-01 09:29:38 +02:00
feat: add document pipeline, ReAct agent, and knowledge core services
Document Pipeline (Team A): - LibrarianService: document storage with filesystem backend, metadata persistence, child document hierarchy, collection management - ChunkingService: recursive character text splitter with configurable chunk size/overlap, FlowProcessor pattern - KnowledgeExtractService: combined relationship + definition extraction using prompt service and LLM, emits RDF triples and entity contexts - KnowledgeCoreService: knowledge core CRUD with streaming export and flow-based loading ReAct Agent (Team B): - StreamingReActParser: state machine for parsing LLM output into Thought/Action/ActionInput/FinalAnswer sections - Three MVP tools: KnowledgeQuery (GraphRAG), DocumentQuery (DocRAG), TriplesQuery with RequestResponse clients - AgentService FlowProcessor with ReAct loop, tool execution, and streaming chunk responses (thought/observation/answer) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5ed3f0e2d8
commit
f09ef4de45
18 changed files with 2145 additions and 2 deletions
106
ts/packages/flow/src/chunking/recursive-splitter.ts
Normal file
106
ts/packages/flow/src/chunking/recursive-splitter.ts
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
/**
|
||||
* Recursive character text splitter.
|
||||
*
|
||||
* Approximates the behaviour of LangChain's RecursiveCharacterTextSplitter
* (overlap is prepended after splitting, so a chunk may exceed chunkSize):
|
||||
* 1. Try separators in order: "\n\n", "\n", " ", ""
|
||||
* 2. Split on the best separator that exists in the text
|
||||
* 3. Merge small pieces until they approach chunkSize
|
||||
* 4. Recursively split pieces that exceed chunkSize with the next separator
|
||||
* 5. Apply overlap: include trailing chunkOverlap chars from the previous chunk
|
||||
*
|
||||
* Python reference: trustgraph-flow/trustgraph/chunking/recursive_splitter/service.py
|
||||
*/
|
||||
|
||||
// Separators tried in order, coarsest first: paragraph break, line break,
// space. The empty string is the last resort and makes the splitter break
// the text into individual characters.
const DEFAULT_SEPARATORS = ["\n\n", "\n", " ", ""];
|
||||
|
||||
/**
 * Split `text` into chunks using recursive character splitting.
 *
 * @param text - The text to split.
 * @param chunkSize - Target maximum chunk length (string length), measured
 *   before overlap is prepended.
 * @param chunkOverlap - Number of trailing characters of the previous chunk
 *   to prepend to each following chunk.
 * @param separators - Separators to try, coarsest first. Defaults to
 *   paragraph break, line break, space, then per-character splitting.
 * @returns The chunks in document order; whitespace-only chunks are omitted.
 */
export function recursiveSplit(
  text: string,
  chunkSize: number,
  chunkOverlap: number,
  separators: string[] = DEFAULT_SEPARATORS,
): string[] {
  // Guarantee a final per-character fallback so a caller-supplied list can
  // never leave an oversized piece without a separator to split it on.
  const ordered = separators.includes("") ? separators : [...separators, ""];
  return splitRecursive(text, chunkSize, chunkOverlap, ordered);
}
|
||||
|
||||
/**
 * Split `text` on the first usable separator, recurse into oversized pieces
 * with the remaining separators, and finally prepend overlap to every chunk
 * except the first.
 *
 * Overlap is applied exactly once, after all splitting is done. Applying it
 * inside the recursion would prefix sub-chunks that already carry an overlap
 * prefix a second time when the caller's level applies its own overlap.
 *
 * @param text - The text to split.
 * @param chunkSize - Target maximum chunk length before overlap.
 * @param chunkOverlap - Characters of the previous chunk to prepend.
 * @param separators - Separators to try in order; "" means split per character.
 * @returns The chunks in document order; whitespace-only chunks are omitted.
 */
function splitRecursive(
  text: string,
  chunkSize: number,
  chunkOverlap: number,
  separators: string[],
): string[] {
  const chunks = splitWithoutOverlap(text, chunkSize, separators);
  return applyOverlap(chunks, chunkOverlap);
}

/** Recursive splitting step of splitRecursive; never adds overlap. */
function splitWithoutOverlap(
  text: string,
  chunkSize: number,
  separators: string[],
): string[] {
  if (text.length <= chunkSize) {
    return text.trim().length > 0 ? [text] : [];
  }

  // Pick the first separator that occurs in the text ("" always qualifies).
  // If none qualifies we fall back to per-character splitting with no
  // separators left, so the recursion below cannot loop on the same input.
  let separator = "";
  let remainingSeparators: string[] = [];
  for (const [index, candidate] of separators.entries()) {
    if (candidate === "" || text.includes(candidate)) {
      separator = candidate;
      remainingSeparators = separators.slice(index + 1);
      break;
    }
  }

  // Spreading the string splits by code point, keeping surrogate pairs intact.
  const pieces = separator === "" ? [...text] : text.split(separator);
  const merged = mergePieces(pieces, separator, chunkSize);

  const results: string[] = [];
  for (const chunk of merged) {
    if (chunk.length > chunkSize && remainingSeparators.length > 0) {
      // A single piece was larger than chunkSize: retry with finer separators.
      for (const subChunk of splitWithoutOverlap(chunk, chunkSize, remainingSeparators)) {
        results.push(subChunk);
      }
    } else if (chunk.trim().length > 0) {
      results.push(chunk);
    }
  }

  return results;
}
|
||||
|
||||
/**
 * Greedily join consecutive pieces with `separator` into chunks.
 *
 * A chunk is closed as soon as appending the next piece (plus separator)
 * would push it past `chunkSize`. A piece that is itself longer than
 * `chunkSize` becomes its own oversized chunk; the caller decides what to do
 * with it.
 *
 * @param pieces - Text fragments in document order.
 * @param separator - String re-inserted between joined fragments.
 * @param chunkSize - Maximum length of a joined chunk.
 * @returns The merged chunks; an empty trailing buffer is not emitted.
 */
function mergePieces(
  pieces: string[],
  separator: string,
  chunkSize: number,
): string[] {
  const merged: string[] = [];
  let buffer = "";

  pieces.forEach((part) => {
    if (buffer.length === 0) {
      // Nothing buffered yet: the piece starts a chunk regardless of its size.
      buffer = part;
      return;
    }

    const joined = buffer + separator + part;
    if (joined.length > chunkSize) {
      merged.push(buffer);
      buffer = part;
    } else {
      buffer = joined;
    }
  });

  if (buffer.length > 0) {
    merged.push(buffer);
  }

  return merged;
}
|
||||
|
||||
/**
 * Prepend the tail of each chunk to the chunk that follows it.
 *
 * The overlap is always taken from the original (un-prefixed) previous chunk,
 * so overlap does not accumulate along the list.
 *
 * @param chunks - Chunks in document order.
 * @param overlapSize - Maximum number of trailing characters (string length)
 *   to copy from the previous chunk. Values that are not greater than zero,
 *   including NaN, disable overlap.
 * @returns `chunks` itself when no overlap applies, otherwise a new array.
 */
function applyOverlap(chunks: string[], overlapSize: number): string[] {
  // `!(x > 0)` also rejects NaN, which would otherwise make slice() start at
  // index 0 and duplicate the entire previous chunk.
  if (!(overlapSize > 0) || chunks.length <= 1) return chunks;

  const result: string[] = [];
  let previous: string | undefined;

  for (const chunk of chunks) {
    result.push(previous === undefined ? chunk : overlapTail(previous, overlapSize) + chunk);
    previous = chunk;
  }

  return result;
}

/** Return at most `size` trailing characters of `text` without cutting a surrogate pair. */
function overlapTail(text: string, size: number): string {
  let start = Math.max(0, Math.trunc(text.length - size));

  if (start > 0 && start < text.length) {
    const here = text.charCodeAt(start);
    const before = text.charCodeAt(start - 1);
    const splitsPair =
      here >= 0xdc00 && here <= 0xdfff && before >= 0xd800 && before <= 0xdbff;
    // Skip the orphaned low surrogate rather than emit an invalid character;
    // the overlap ends up one unit shorter, never longer than requested.
    if (splitsPair) start += 1;
  }

  return text.slice(start);
}
|
||||
94
ts/packages/flow/src/chunking/service.ts
Normal file
94
ts/packages/flow/src/chunking/service.ts
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
/**
|
||||
* Chunking service — splits text documents into chunks for downstream processing.
|
||||
*
|
||||
* A FlowProcessor that:
|
||||
* 1. Consumes TextDocument messages
|
||||
* 2. Splits text using recursive character text splitting
|
||||
* 3. Emits Chunk messages for each resulting chunk
|
||||
*
|
||||
* Python reference: trustgraph-flow/trustgraph/chunking/recursive_splitter/service.py
|
||||
*/
|
||||
|
||||
import {
|
||||
FlowProcessor,
|
||||
ConsumerSpec,
|
||||
ProducerSpec,
|
||||
ParameterSpec,
|
||||
type ProcessorConfig,
|
||||
type FlowContext,
|
||||
type TextDocument,
|
||||
type Chunk,
|
||||
type Triples,
|
||||
} from "@trustgraph/base";
|
||||
import { recursiveSplit } from "./recursive-splitter.js";
|
||||
|
||||
// Fallback chunk size (string length) used when the flow does not supply a
// usable "chunk-size" parameter.
const DEFAULT_CHUNK_SIZE = 2000;
|
||||
// Fallback overlap (string length) used when the flow does not supply a
// usable "chunk-overlap" parameter.
const DEFAULT_CHUNK_OVERLAP = 100;
|
||||
|
||||
/**
 * Flow processor that splits incoming text documents into chunks.
 *
 * Registers an "input" consumer for TextDocument messages, "output" and
 * "triples" producers, and the "chunk-size" / "chunk-overlap" parameters.
 * Only the "output" producer is written to by this class.
 */
export class ChunkingService extends FlowProcessor {
  constructor(config: ProcessorConfig) {
    super(config);

    this.registerSpecification(
      new ConsumerSpec<TextDocument>("input", this.onMessage.bind(this)),
    );
    this.registerSpecification(new ProducerSpec<Chunk>("output"));
    this.registerSpecification(new ProducerSpec<Triples>("triples"));
    this.registerSpecification(new ParameterSpec("chunk-size"));
    this.registerSpecification(new ParameterSpec("chunk-overlap"));

    console.log("[ChunkingService] Service initialized");
  }

  /**
   * Handle one TextDocument: split its text and emit one Chunk per piece.
   *
   * @param msg - The document to split.
   * @param properties - Message properties; `id` is used as the send key.
   * @param flowCtx - Flow context giving access to parameters and producers.
   */
  private async onMessage(
    msg: TextDocument,
    properties: Record<string, string>,
    flowCtx: FlowContext,
  ): Promise<void> {
    const requestId = properties.id;
    if (!requestId) {
      // Without an id there is no key to send chunks under; make the drop visible.
      console.warn("[ChunkingService] Dropping message without an id property");
      return;
    }

    // Parameter lookups may throw or yield unusable values; both cases fall
    // back to the module defaults so the splitter always gets sane numbers.
    const chunkSize = resolveNumericParameter(
      () => flowCtx.flow.parameter<number>("chunk-size"),
      "chunk-size",
      DEFAULT_CHUNK_SIZE,
      1,
    );
    const chunkOverlap = resolveNumericParameter(
      () => flowCtx.flow.parameter<number>("chunk-overlap"),
      "chunk-overlap",
      DEFAULT_CHUNK_OVERLAP,
      0,
    );

    const text = msg.text;
    if (!text || text.trim().length === 0) {
      console.warn(`[ChunkingService] Empty text received for document ${msg.documentId}`);
      return;
    }

    const chunks = recursiveSplit(text, chunkSize, chunkOverlap);

    console.log(
      `[ChunkingService] Split document ${msg.documentId} into ${chunks.length} chunks (size=${chunkSize}, overlap=${chunkOverlap})`,
    );

    const outputProducer = flowCtx.flow.producer<Chunk>("output");

    // Sent one at a time so chunks are emitted in document order.
    for (const chunkText of chunks) {
      const chunk: Chunk = {
        metadata: msg.metadata,
        chunk: chunkText,
        documentId: msg.documentId,
      };

      await outputProducer.send(requestId, chunk);
    }
  }
}

/**
 * Read a numeric flow parameter, falling back when it is missing or unusable.
 *
 * @param read - Thunk performing the parameter lookup; it may throw.
 * @param name - Parameter name, used only in the warning message.
 * @param fallback - Value returned when the parameter cannot be used.
 * @param min - Smallest acceptable value (inclusive).
 * @returns The parameter floored to an integer, or `fallback`.
 */
function resolveNumericParameter(
  read: () => unknown,
  name: string,
  fallback: number,
  min: number,
): number {
  let raw: unknown;
  try {
    raw = read();
  } catch {
    // The lookup threw: use the default, as the original handler did.
    return fallback;
  }

  // Treat an absent value like an unconfigured parameter, without logging.
  if (raw === undefined || raw === null) return fallback;

  // Accept numeric strings, since configuration values may arrive as text.
  const value = typeof raw === "string" && raw.trim() !== "" ? Number(raw) : raw;
  if (typeof value === "number" && Number.isFinite(value) && value >= min) {
    return Math.floor(value);
  }

  console.warn(
    `[ChunkingService] Ignoring invalid "${name}" parameter (${String(raw)}); using default ${fallback}`,
  );
  return fallback;
}
|
||||
|
||||
/**
 * Service entry point.
 *
 * Awaits `ChunkingService.launch` with the name "chunking" and resolves once
 * that call settles; any rejection propagates to the caller.
 */
export async function run(): Promise<void> {
  await ChunkingService.launch("chunking");
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue