trustgraph/ts/packages/flow/src/extract/knowledge-extract.ts

404 lines
12 KiB
TypeScript
Raw Normal View History

/**
* Knowledge extraction service extracts relationships and definitions from text chunks.
*
* A FlowProcessor that:
* 1. Consumes Chunk messages
* 2. Uses prompt service + LLM to extract relationships and definitions
* 3. Converts extractions into RDF triples and entity contexts
* 4. Emits Triples and EntityContexts messages
*
* Python reference: trustgraph-flow/trustgraph/extract/knowledge/service.py
*/
import {
2026-06-01 20:26:47 -05:00
makeFlowProcessor,
makeConsumerSpec,
makeProducerSpec,
makeRequestResponseSpec,
2026-06-01 16:22:25 -05:00
makeFlowProcessorProgram,
type ProcessorConfig,
2026-06-01 20:26:47 -05:00
type FlowProcessorRuntime,
type FlowContext,
type Chunk,
type Triples,
type EntityContexts,
type EntityContext,
type PromptRequest,
type PromptResponse,
type TextCompletionRequest,
type TextCompletionResponse,
type Triple,
type Term,
2026-06-01 16:22:25 -05:00
type FlowResourceNotFoundError,
type MessagingDeliveryError,
type EffectRequestResponse,
type Spec,
} from "@trustgraph/base";
import { NodeRuntime } from "@effect/platform-node";
import { Effect, Layer, ManagedRuntime } from "effect";
2026-06-01 16:22:25 -05:00
import * as O from "effect/Option";
import * as S from "effect/Schema";
// Well-known RDF/SKOS predicate IRIs used when emitting triples:
// RDFS_LABEL is attached to every minted entity IRI with its original name,
// SKOS_DEFINITION carries the extracted definition text for an entity.
const RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label";
const SKOS_DEFINITION = "http://www.w3.org/2004/02/skos/core#definition";
2026-06-01 16:22:25 -05:00
/** One subject/predicate/object relationship as returned by the LLM; all fields are strings. */
const ExtractedRelationship = S.Struct({
  subject: S.String,
  predicate: S.String,
  object: S.String,
});
type ExtractedRelationship = typeof ExtractedRelationship.Type;

/** Schema for a JSON string holding an array of relationships, plus its Option-returning decoder. */
const ExtractedRelationshipsFromJson = S.fromJsonString(S.Array(ExtractedRelationship));
const decodeExtractedRelationships = S.decodeUnknownOption(ExtractedRelationshipsFromJson);

/** One entity/definition pair as returned by the LLM; both fields are strings. */
const ExtractedDefinition = S.Struct({
  entity: S.String,
  definition: S.String,
});
type ExtractedDefinition = typeof ExtractedDefinition.Type;

/** Schema for a JSON string holding an array of definitions, plus its Option-returning decoder. */
const ExtractedDefinitionsFromJson = S.fromJsonString(S.Array(ExtractedDefinition));
const decodeExtractedDefinitions = S.decodeUnknownOption(ExtractedDefinitionsFromJson);
/** Error channel declared by the chunk message handler. */
type KnowledgeExtractHandlerError =
| FlowResourceNotFoundError
| MessagingDeliveryError;
/** Request/response client used to fetch a named prompt. */
type PromptClient = EffectRequestResponse<PromptRequest, PromptResponse>;
/** Request/response client used to run a text completion. */
type LlmClient = EffectRequestResponse<TextCompletionRequest, TextCompletionResponse>;
2026-06-02 03:23:23 -05:00
// Flow specs shared between the handler (which resolves them at runtime via
// flowCtx.flow) and makeKnowledgeExtractSpecs (which registers them).
/** Producer spec for outgoing Triples messages. */
const ExtractTriplesProducer = makeProducerSpec<Triples>("extract-triples");
/** Producer spec for outgoing EntityContexts messages. */
const ExtractEntityContextsProducer = makeProducerSpec<EntityContexts>("extract-entity-contexts");
/**
 * Request/response spec for the prompt service.
 * NOTE(review): the three strings are presumably (spec name, request name, response name) — confirm against makeRequestResponseSpec.
 */
const PromptClientSpec = makeRequestResponseSpec<PromptRequest, PromptResponse>(
"prompt-client",
"prompt-request",
"prompt-response",
);
/** Request/response spec for the text-completion (LLM) service; same argument layout as PromptClientSpec. */
const LlmClientSpec = makeRequestResponseSpec<TextCompletionRequest, TextCompletionResponse>(
"llm-client",
"text-completion-request",
"text-completion-response",
);
2026-06-01 16:22:25 -05:00
/**
 * Request the prompt called `name` from the prompt client, passing the chunk
 * text as the single `text` variable. The request is issued with
 * `timeoutMs: 10_000`.
 *
 * @param promptClient - client used to issue the request
 * @param name - name of the prompt to fetch
 * @param text - chunk text supplied as the `text` variable
 * @returns the prompt client's response
 */
const requestPrompt = Effect.fn("KnowledgeExtract.requestPrompt")(function* (
  promptClient: PromptClient,
  name: string,
  text: string,
) {
  const variables = { text };
  const response = yield* promptClient.request({ name, variables }, { timeoutMs: 10_000 });
  return response;
});
2026-06-01 16:22:25 -05:00
/**
 * Send a prompt's `system` and `prompt` fields to the LLM client as a text
 * completion request, issued with `timeoutMs: 120_000`.
 *
 * @param llmClient - client used to issue the completion request
 * @param prompt - prompt response whose `system` and `prompt` fields are forwarded
 * @returns the LLM client's response
 */
const requestCompletion = Effect.fn("KnowledgeExtract.requestCompletion")(function* (
  llmClient: LlmClient,
  prompt: PromptResponse,
) {
  const { system, prompt: userPrompt } = prompt;
  const completion = yield* llmClient.request(
    { system, prompt: userPrompt },
    { timeoutMs: 120_000 },
  );
  return completion;
});
2026-06-01 16:22:25 -05:00
/**
 * Fetch the "extract-relationships" prompt for `text` and ask the LLM to
 * complete it, decoding the completion into relationships.
 *
 * The completion is requested up to three times. Only a response that fails
 * to parse triggers another attempt; a completion carrying an error or an
 * empty response ends the loop immediately.
 *
 * @returns the decoded relationships, or null when the prompt response has an
 *   error, the completion is unusable, or no attempt parsed
 */
const extractRelationships = Effect.fn("KnowledgeExtract.extractRelationships")(function* (
  promptClient: PromptClient,
  llmClient: LlmClient,
  text: string,
) {
  const prompt = yield* requestPrompt(promptClient, "extract-relationships", text);
  if (prompt.error !== undefined) {
    return null;
  }

  let attempt = 0;
  while (attempt < 3) {
    const completion = yield* requestCompletion(llmClient, prompt);
    if (completion.error !== undefined || completion.response.length === 0) {
      // Errors and empty responses are not retried.
      return null;
    }

    const parsed = parseRelationshipsResponse(completion.response);
    if (parsed !== null) {
      return parsed;
    }

    attempt += 1;
    yield* Effect.logWarning(
      `[KnowledgeExtract] Relationship parse failed, attempt ${attempt}/3`,
    );
  }

  return null;
});
/**
 * Fetch the "extract-definitions" prompt for `text` and ask the LLM to
 * complete it, decoding the completion into entity definitions.
 *
 * The completion is requested up to three times. Only a response that fails
 * to parse triggers another attempt; a completion carrying an error or an
 * empty response ends the loop immediately.
 *
 * @returns the decoded definitions, or null when the prompt response has an
 *   error, the completion is unusable, or no attempt parsed
 */
const extractDefinitions = Effect.fn("KnowledgeExtract.extractDefinitions")(function* (
  promptClient: PromptClient,
  llmClient: LlmClient,
  text: string,
) {
  const prompt = yield* requestPrompt(promptClient, "extract-definitions", text);
  if (prompt.error !== undefined) {
    return null;
  }

  let attempt = 0;
  while (attempt < 3) {
    const completion = yield* requestCompletion(llmClient, prompt);
    if (completion.error !== undefined || completion.response.length === 0) {
      // Errors and empty responses are not retried.
      return null;
    }

    const parsed = parseDefinitionsResponse(completion.response);
    if (parsed !== null) {
      return parsed;
    }

    attempt += 1;
    yield* Effect.logWarning(
      `[KnowledgeExtract] Definition parse failed, attempt ${attempt}/3`,
    );
  }

  return null;
});
/**
 * Handle one Chunk message.
 *
 * Steps, in order:
 * 1. Skip the message when `properties.id` is missing/empty (a warning is
 *    logged) or when the chunk text is blank.
 * 2. Resolve the prompt/LLM clients and the two producers from the flow.
 * 3. Extract relationships; each complete relationship yields the s/p/o
 *    triple, an rdfs:label triple for each of the three terms, and entity
 *    contexts for the subject and object.
 * 4. Extract definitions; each complete definition yields a skos:definition
 *    triple, an rdfs:label triple, and an entity context.
 * 5. Send the collected triples and entity contexts (each only if non-empty),
 *    keyed by the request id and carrying the chunk's metadata.
 *
 * A failure inside either extraction step is logged and treated as "nothing
 * extracted"; the other step and the sends still run.
 *
 * @param msg - incoming chunk (`chunk`, `documentId`, `metadata` are read)
 * @param properties - message properties; `id` is used as the send key
 * @param flowCtx - flow context used to resolve clients and producers
 */
const onKnowledgeExtractMessage = Effect.fn("KnowledgeExtractService.onMessage")(function* (
  msg: Chunk,
  properties: Record<string, string>,
  flowCtx: FlowContext,
): Effect.fn.Return<void, KnowledgeExtractHandlerError> {
  const requestId = properties.id;
  if (requestId === undefined || requestId.length === 0) {
    // Outputs are keyed by this id; leave a trace rather than dropping the chunk silently.
    yield* Effect.logWarning("[KnowledgeExtract] Skipping chunk: message properties have no id");
    return;
  }

  const text = msg.chunk;
  if (text.trim().length === 0) return;

  const promptClient = yield* flowCtx.flow.requestorEffect(PromptClientSpec);
  const llmClient = yield* flowCtx.flow.requestorEffect(LlmClientSpec);
  const triplesProducer = yield* flowCtx.flow.producerEffect(ExtractTriplesProducer);
  const entityContextsProducer = yield* flowCtx.flow.producerEffect(ExtractEntityContextsProducer);

  // Render a caught failure for the structured log payload.
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  // rdfs:label triple tying a minted IRI back to the name it was minted from.
  const labelTriple = (subject: Term, label: string): Triple => ({
    s: subject,
    p: iriTerm(RDFS_LABEL),
    o: literalTerm(label),
  });

  // Entity context associating an entity with the full chunk text.
  // NOTE(review): chunkId is populated from msg.documentId — confirm this is the intended identifier.
  const contextFor = (entity: Term): EntityContext => ({
    entity,
    context: text,
    chunkId: msg.documentId,
  });

  const allTriples: Triple[] = [];
  const allEntityContexts: EntityContext[] = [];

  const relationships = yield* extractRelationships(promptClient, llmClient, text).pipe(
    Effect.catch((error: unknown) =>
      Effect.logError("[KnowledgeExtract] Relationship extraction failed", {
        error: describeError(error),
      }).pipe(Effect.as(null)),
    ),
  );
  if (relationships !== null) {
    for (const rel of relationships) {
      // Relationships with any empty term are ignored.
      if (
        rel.subject.length === 0 ||
        rel.predicate.length === 0 ||
        rel.object.length === 0
      ) {
        continue;
      }

      const subjectIri = toEntityIri(rel.subject);
      const predicateIri = toEntityIri(rel.predicate);
      const objectIri = toEntityIri(rel.object);

      allTriples.push(
        { s: subjectIri, p: predicateIri, o: objectIri },
        labelTriple(subjectIri, rel.subject),
        labelTriple(predicateIri, rel.predicate),
        labelTriple(objectIri, rel.object),
      );
      allEntityContexts.push(contextFor(subjectIri), contextFor(objectIri));
    }

    yield* Effect.log(`[KnowledgeExtract] Extracted ${relationships.length} relationships`);
  }

  const definitions = yield* extractDefinitions(promptClient, llmClient, text).pipe(
    Effect.catch((error: unknown) =>
      Effect.logError("[KnowledgeExtract] Definition extraction failed", {
        error: describeError(error),
      }).pipe(Effect.as(null)),
    ),
  );
  if (definitions !== null) {
    for (const def of definitions) {
      // Definitions with an empty entity or empty text are ignored.
      if (def.entity.length === 0 || def.definition.length === 0) continue;

      const entityIri = toEntityIri(def.entity);
      allTriples.push(
        { s: entityIri, p: iriTerm(SKOS_DEFINITION), o: literalTerm(def.definition) },
        labelTriple(entityIri, def.entity),
      );
      allEntityContexts.push(contextFor(entityIri));
    }

    yield* Effect.log(`[KnowledgeExtract] Extracted ${definitions.length} definitions`);
  }

  if (allTriples.length > 0) {
    yield* triplesProducer.send(requestId, {
      metadata: msg.metadata,
      triples: allTriples,
    });
  }
  if (allEntityContexts.length > 0) {
    yield* entityContextsProducer.send(requestId, {
      metadata: msg.metadata,
      entities: allEntityContexts,
    });
  }
});
/**
 * Build the spec list for the knowledge-extract processor: the
 * "extract-input" consumer bound to the chunk handler, followed by the two
 * producers and the prompt/LLM request-response clients the handler resolves.
 */
export const makeKnowledgeExtractSpecs = (): ReadonlyArray<Spec<never>> => {
  return [
    makeConsumerSpec<Chunk, KnowledgeExtractHandlerError>(
      "extract-input",
      onKnowledgeExtractMessage,
    ),
    ExtractTriplesProducer,
    ExtractEntityContextsProducer,
    PromptClientSpec,
    LlmClientSpec,
  ];
};
2026-06-01 20:26:47 -05:00
/** Type of the value returned by makeKnowledgeExtractService; an alias of FlowProcessorRuntime. */
export type KnowledgeExtractService = FlowProcessorRuntime;
2026-06-01 16:22:25 -05:00
2026-06-01 20:26:47 -05:00
/**
 * Create the knowledge-extract flow processor for `config`, registering the
 * specs from makeKnowledgeExtractSpecs, and log that it was initialized.
 *
 * @param config - processor configuration forwarded to makeFlowProcessor
 * @returns the flow processor runtime
 */
export function makeKnowledgeExtractService(config: ProcessorConfig): KnowledgeExtractService {
  const specifications = makeKnowledgeExtractSpecs();
  const processor = makeFlowProcessor(config, { specifications });

  Effect.runSync(Effect.log("[KnowledgeExtract] Service initialized"));

  return processor;
}
2026-06-01 20:26:47 -05:00
/** Value-level alias of makeKnowledgeExtractService, sharing its name with the KnowledgeExtractService type. */
export const KnowledgeExtractService = makeKnowledgeExtractService;
// ---------- Helpers ----------
/**
 * Mint an entity IRI under `http://trustgraph.ai/e/` from a free-text name.
 *
 * The name is lower-cased, each run of whitespace becomes a single `-`, and
 * the result is percent-encoded with `encodeURIComponent`.
 *
 * `encodeURIComponent` throws `URIError` on lone UTF-16 surrogates, which can
 * appear in names decoded from JSON `\uXXXX` escapes. Lone surrogates are
 * therefore replaced with U+FFFD first, so a malformed name yields an IRI
 * instead of throwing. Well-formed names are encoded exactly as before.
 *
 * @param name - entity, predicate, or object name
 * @returns an IRI term for the name
 */
function toEntityIri(name: string): Term {
  // Match a high surrogate with no following low surrogate, or a low surrogate with no preceding high one.
  const wellFormed = name.replace(
    /[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g,
    "\uFFFD",
  );
  const slug = encodeURIComponent(wellFormed.toLowerCase().replace(/\s+/g, "-"));
  return {
    type: "IRI",
    iri: `http://trustgraph.ai/e/${slug}`,
  };
}
/** Wrap an IRI string in a term object tagged `type: "IRI"`. */
function iriTerm(iri: string): Term {
  const term: Term = { type: "IRI", iri };
  return term;
}
/** Wrap a string value in a term object tagged `type: "LITERAL"`. */
function literalTerm(value: string): Term {
  const term: Term = { type: "LITERAL", value };
  return term;
}
/**
 * Parse JSON out of raw LLM output.
 *
 * Each candidate string produced by jsonCandidates is tried in order and the
 * first one that decodes as JSON wins. When none decodes, a warning carrying
 * the first 300 characters of the response is logged and null is returned.
 *
 * The decoded value is cast to `T` without validation; callers are
 * responsible for checking its shape.
 *
 * @param raw - raw LLM response text
 * @returns the first successfully decoded value, or null
 */
export function parseJsonResponse<T>(raw: string): T | null {
  const decodeJson = S.decodeUnknownOption(S.UnknownFromJsonString);

  for (const candidate of jsonCandidates(raw)) {
    const attempt = decodeJson(candidate);
    if (O.isNone(attempt)) continue;
    return attempt.value as T;
  }

  const preview = raw.slice(0, 300);
  Effect.runSync(
    Effect.logWarning("[KnowledgeExtract] Failed to parse JSON from LLM response", {
      response: preview,
    }),
  );
  return null;
}
/**
 * Decode raw LLM output into relationships.
 *
 * Each candidate from jsonCandidates is run through the relationships
 * decoder; the first that decodes is returned. When none decodes, a warning
 * carrying the first 300 characters of the response is logged.
 *
 * @param raw - raw LLM response text
 * @returns the decoded relationships, or null when no candidate decodes
 */
function parseRelationshipsResponse(raw: string): ReadonlyArray<ExtractedRelationship> | null {
  for (const candidate of jsonCandidates(raw)) {
    const attempt = decodeExtractedRelationships(candidate);
    if (O.isNone(attempt)) continue;
    return attempt.value;
  }

  const preview = raw.slice(0, 300);
  Effect.runSync(
    Effect.logWarning("[KnowledgeExtract] Failed to parse relationships from LLM response", {
      response: preview,
    }),
  );
  return null;
}
/**
 * Decode raw LLM output into entity definitions.
 *
 * Each candidate from jsonCandidates is run through the definitions decoder;
 * the first that decodes is returned. When none decodes, a warning carrying
 * the first 300 characters of the response is logged.
 *
 * @param raw - raw LLM response text
 * @returns the decoded definitions, or null when no candidate decodes
 */
function parseDefinitionsResponse(raw: string): ReadonlyArray<ExtractedDefinition> | null {
  for (const candidate of jsonCandidates(raw)) {
    const attempt = decodeExtractedDefinitions(candidate);
    if (O.isNone(attempt)) continue;
    return attempt.value;
  }

  const preview = raw.slice(0, 300);
  Effect.runSync(
    Effect.logWarning("[KnowledgeExtract] Failed to parse definitions from LLM response", {
      response: preview,
    }),
  );
  return null;
}
/**
 * Produce candidate JSON strings from raw LLM output, in the order callers
 * should try them:
 *
 * 1. The trimmed text, with a surrounding ``` / ```json fence removed when the
 *    whole response is fenced.
 * 2. The span from the first `[` to the last `]`, when one exists.
 * 3. That span cut after its last `}` and re-closed with `]` (drops trailing
 *    junk before the closing bracket).
 * 4. The span from the first `[` to the last `}` anywhere in the text,
 *    re-closed with `]`. This recovers every complete object from output that
 *    was cut off before the array was closed; previously such output had no
 *    array candidate at all and only its first object survived.
 * 5. The first `{...}` object wrapped in `[` `]`.
 *
 * Candidates are not validated here; callers decode them and take the first
 * success.
 *
 * @param raw - raw LLM response text
 * @returns candidate strings, most faithful first
 */
function jsonCandidates(raw: string): ReadonlyArray<string> {
  const candidates: string[] = [];

  let cleaned = raw.trim();
  const fenceMatch = cleaned.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?```$/);
  if (fenceMatch !== null) {
    cleaned = (fenceMatch[1] ?? "").trim();
  }
  candidates.push(cleaned);

  const arrayMatch = cleaned.match(/\[[\s\S]*\]/);
  if (arrayMatch !== null) {
    const partial = arrayMatch[0];
    candidates.push(partial);
    const lastBrace = partial.lastIndexOf("}");
    if (lastBrace > 0) {
      candidates.push(partial.slice(0, lastBrace + 1) + "]");
    }
  }

  // Truncated output: the array was opened but the text ends (or the last `]`
  // falls) before the final complete object. Close it after the last `}`.
  const arrayStart = cleaned.indexOf("[");
  const lastObjectEnd = cleaned.lastIndexOf("}");
  if (arrayStart !== -1 && lastObjectEnd > arrayStart) {
    const repaired = cleaned.slice(arrayStart, lastObjectEnd + 1) + "]";
    if (!candidates.includes(repaired)) {
      candidates.push(repaired);
    }
  }

  const objMatch = cleaned.match(/\{[\s\S]*?\}/);
  if (objMatch !== null) {
    candidates.push(`[${objMatch[0]}]`);
  }

  return candidates;
}
2026-06-01 16:22:25 -05:00
/**
 * Runnable program for the processor, built by makeFlowProcessorProgram with
 * id "knowledge-extract" and a spec factory that builds a fresh spec list on
 * each call.
 */
export const program = makeFlowProcessorProgram({
  id: "knowledge-extract",
  specs: () => makeKnowledgeExtractSpecs(),
});
/** Managed runtime built from an empty layer; used by run() to execute the program. */
const knowledgeExtractRuntime = ManagedRuntime.make(Layer.empty);
2026-06-01 23:19:54 -05:00
/**
 * Run the processor program on the module's managed runtime.
 *
 * @returns the promise produced by the runtime's `runPromise` for the program
 */
export function run(): Promise<void> {
  const completion = knowledgeExtractRuntime.runPromise(program);
  return completion;
}
/** Process entry point: hands the processor program to NodeRuntime.runMain. */
export function runMain(): void {
NodeRuntime.runMain(program);
}