/**
 * Knowledge extraction service — extracts relationships and definitions from text chunks.
 *
 * A FlowProcessor that:
 * 1. Consumes Chunk messages
 * 2. Uses prompt service + LLM to extract relationships and definitions
 * 3. Converts extractions into RDF triples and entity contexts
 * 4. Emits Triples and EntityContexts messages
 *
 * Python reference: trustgraph-flow/trustgraph/extract/knowledge/service.py
 */

import {
  FlowProcessor,
  ConsumerSpec,
  ProducerSpec,
  RequestResponseSpec,
  type ProcessorConfig,
  type FlowContext,
  type Chunk,
  type Triples,
  type EntityContexts,
  type EntityContext,
  type PromptRequest,
  type PromptResponse,
  type TextCompletionRequest,
  type TextCompletionResponse,
  type Triple,
  type Term,
} from "@trustgraph/base";

// Well-known RDF/SKOS IRIs
const RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label";
const SKOS_DEFINITION = "http://www.w3.org/2004/02/skos/core#definition";

// Timeout for rendering a prompt template via the prompt service.
const PROMPT_TIMEOUT_MS = 10_000;
// Timeout for a single LLM text completion.
const LLM_TIMEOUT_MS = 120_000;
// How many completions are requested when the LLM output cannot be parsed.
const MAX_PARSE_ATTEMPTS = 3;
// Upper bound on closing positions tried when repairing a truncated JSON array.
const MAX_REPAIR_TRIES = 64;

/** One subject/predicate/object statement as returned by the LLM. */
interface ExtractedRelationship {
  subject: string;
  predicate: string;
  object: string;
}

/** One entity definition as returned by the LLM. */
interface ExtractedDefinition {
  entity: string;
  definition: string;
}

export class KnowledgeExtractService extends FlowProcessor {
  constructor(config: ProcessorConfig) {
    super(config);

    this.registerSpecification(
      new ConsumerSpec("extract-input", this.onMessage.bind(this)),
    );
    this.registerSpecification(new ProducerSpec("extract-triples"));
    this.registerSpecification(new ProducerSpec("extract-entity-contexts"));
    this.registerSpecification(
      new RequestResponseSpec(
        "prompt-client",
        "prompt-request",
        "prompt-response",
      ),
    );
    this.registerSpecification(
      new RequestResponseSpec(
        "llm-client",
        "text-completion-request",
        "text-completion-response",
      ),
    );

    console.log("[KnowledgeExtract] Service initialized");
  }

  /**
   * Handles one chunk: runs relationship and definition extraction, converts
   * the results to triples and entity contexts, and emits whatever was produced.
   *
   * The two extractions are independent: a failure in one is logged and does
   * not prevent the other from running or its results from being emitted.
   * Messages without an `id` property or with blank chunk text are ignored.
   *
   * @param msg - The chunk to process.
   * @param properties - Message properties; `id` is used as the outgoing message id.
   * @param flowCtx - Flow context providing the requestors and producers.
   */
  private async onMessage(
    msg: Chunk,
    properties: Record<string, string>,
    flowCtx: FlowContext,
  ): Promise<void> {
    const requestId = properties.id;
    if (!requestId) return;

    const text = msg.chunk;
    if (!text || text.trim().length === 0) return;

    // NOTE(review): the type arguments below were reconstructed from the
    // otherwise-unused type imports — confirm against the @trustgraph/base API.
    const promptClient = flowCtx.flow.requestor<PromptRequest, PromptResponse>(
      "prompt-client",
    );
    const llmClient = flowCtx.flow.requestor<
      TextCompletionRequest,
      TextCompletionResponse
    >("llm-client");
    const triplesProducer = flowCtx.flow.producer("extract-triples");
    const entityContextsProducer = flowCtx.flow.producer(
      "extract-entity-contexts",
    );

    const allTriples: Triple[] = [];
    const allEntityContexts: EntityContext[] = [];

    // Every entity context refers back to the same chunk text.
    // NOTE(review): chunkId is filled from msg.documentId — confirm this is the intended id.
    const addEntityContext = (entity: Term): void => {
      allEntityContexts.push({
        entity,
        context: text,
        chunkId: msg.documentId,
      });
    };

    // Renders the named prompt, then asks the LLM for a completion and parses
    // it as a JSON array. Only unparseable output is retried; a prompt error,
    // an LLM error, or an empty response ends the extraction with null.
    const extract = async (
      promptName: string,
      label: string,
    ): Promise<unknown[] | null> => {
      const prompt = await promptClient.request(
        { name: promptName, variables: { text } },
        { timeoutMs: PROMPT_TIMEOUT_MS },
      );
      if (prompt.error) return null;

      for (let attempt = 1; attempt <= MAX_PARSE_ATTEMPTS; attempt++) {
        const completion = await llmClient.request(
          { system: prompt.system, prompt: prompt.prompt },
          { timeoutMs: LLM_TIMEOUT_MS },
        );
        if (completion.error || !completion.response) return null;

        const items = parseJsonResponse<unknown[]>(completion.response);
        if (items !== null) return items;

        console.warn(
          `[KnowledgeExtract] ${label} parse failed, attempt ${attempt}/${MAX_PARSE_ATTEMPTS}`,
        );
      }
      return null;
    };

    // --- Extract relationships ---
    try {
      const relationships = await extract(
        "extract-relationships",
        "Relationship",
      );
      if (relationships) {
        for (const rel of relationships) {
          // Skip malformed items instead of letting one bad entry abort the batch.
          if (!isRelationship(rel)) continue;

          const subjectIri = toEntityIri(rel.subject);
          const predicateIri = toEntityIri(rel.predicate);
          const objectIri = toEntityIri(rel.object);

          // Main relationship triple
          allTriples.push({ s: subjectIri, p: predicateIri, o: objectIri });

          // rdfs:label triples for each entity
          allTriples.push({
            s: subjectIri,
            p: iriTerm(RDFS_LABEL),
            o: literalTerm(rel.subject),
          });
          allTriples.push({
            s: predicateIri,
            p: iriTerm(RDFS_LABEL),
            o: literalTerm(rel.predicate),
          });
          allTriples.push({
            s: objectIri,
            p: iriTerm(RDFS_LABEL),
            o: literalTerm(rel.object),
          });

          // Entity contexts for subject and object
          addEntityContext(subjectIri);
          addEntityContext(objectIri);
        }
        console.log(
          `[KnowledgeExtract] Extracted ${relationships.length} relationships`,
        );
      }
    } catch (err) {
      console.error("[KnowledgeExtract] Relationship extraction failed:", err);
    }

    // --- Extract definitions ---
    try {
      const definitions = await extract("extract-definitions", "Definition");
      if (definitions) {
        for (const def of definitions) {
          if (!isDefinition(def)) continue;

          const entityIri = toEntityIri(def.entity);

          // Definition triple
          allTriples.push({
            s: entityIri,
            p: iriTerm(SKOS_DEFINITION),
            o: literalTerm(def.definition),
          });

          // Label triple
          allTriples.push({
            s: entityIri,
            p: iriTerm(RDFS_LABEL),
            o: literalTerm(def.entity),
          });

          // Entity context
          addEntityContext(entityIri);
        }
        console.log(
          `[KnowledgeExtract] Extracted ${definitions.length} definitions`,
        );
      }
    } catch (err) {
      console.error("[KnowledgeExtract] Definition extraction failed:", err);
    }

    // --- Emit results ---
    if (allTriples.length > 0) {
      const triplesMessage: Triples = {
        metadata: msg.metadata,
        triples: allTriples,
      };
      await triplesProducer.send(requestId, triplesMessage);
    }

    if (allEntityContexts.length > 0) {
      const entityContextsMessage: EntityContexts = {
        metadata: msg.metadata,
        entities: allEntityContexts,
      };
      await entityContextsProducer.send(requestId, entityContextsMessage);
    }
  }
}

// ---------- Helpers ----------

/** True when `value` is a string with at least one character. */
function isNonEmptyString(value: unknown): value is string {
  return typeof value === "string" && value.length > 0;
}

/** Narrows an untrusted parsed item to a relationship with all three fields present. */
function isRelationship(value: unknown): value is ExtractedRelationship {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    isNonEmptyString(candidate.subject) &&
    isNonEmptyString(candidate.predicate) &&
    isNonEmptyString(candidate.object)
  );
}

/** Narrows an untrusted parsed item to a definition with both fields present. */
function isDefinition(value: unknown): value is ExtractedDefinition {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    isNonEmptyString(candidate.entity) &&
    isNonEmptyString(candidate.definition)
  );
}

/**
 * Builds the entity IRI for a name: lower-cased, whitespace runs replaced by
 * "-", then percent-encoded and appended to the TrustGraph entity namespace.
 */
function toEntityIri(name: string): Term {
  const slug = encodeURIComponent(name.toLowerCase().replace(/\s+/g, "-"));
  return {
    type: "IRI",
    iri: `http://trustgraph.ai/e/${slug}`,
  };
}

/** Wraps an IRI string as an IRI term. */
function iriTerm(iri: string): Term {
  return { type: "IRI", iri };
}

/** Wraps a string as a literal term. */
function literalTerm(value: string): Term {
  return { type: "LITERAL", value };
}

/**
 * Parse a JSON array from LLM output, handling markdown code fences and malformed output.
 *
 * Uses progressive fallback:
 * 1. direct parse of the (un-fenced) text,
 * 2. the span from the first "[" to the last "]",
 * 3. truncated-array repair: close the array after the last "}" that yields valid JSON,
 * 4. the first "{...}" object on its own.
 *
 * Every successful parse is normalised to an array: an array is returned as-is,
 * a single object is wrapped in a one-element array, and any other JSON value
 * (string, number, boolean, null) counts as a failed attempt.
 *
 * @typeParam T - Expected result type; callers are expected to pass an array type.
 *   Elements are NOT validated here.
 * @param raw - Raw LLM response text.
 * @returns The parsed array, or null when no attempt produced one.
 */
function parseJsonResponse<T>(raw: string): T | null {
  let cleaned = raw.trim();
  const fenceMatch = cleaned.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?```$/);
  const fenced = fenceMatch?.[1];
  if (fenced !== undefined) {
    cleaned = fenced.trim();
  }

  const tryParse = (candidate: string): T | null => {
    try {
      const value: unknown = JSON.parse(candidate);
      if (Array.isArray(value)) return value as T;
      // The double assertion is inherent to the generic signature: T is
      // caller-chosen and this function only guarantees an array result.
      if (typeof value === "object" && value !== null) {
        return [value] as unknown as T;
      }
      return null;
    } catch {
      return null;
    }
  };

  // Attempt 1: direct parse after stripping fences
  const direct = tryParse(cleaned);
  if (direct !== null) return direct;

  const open = cleaned.indexOf("[");
  if (open !== -1) {
    // Attempt 2: extract the JSON array embedded in surrounding text
    const close = cleaned.lastIndexOf("]");
    if (close > open) {
      const embedded = tryParse(cleaned.slice(open, close + 1));
      if (embedded !== null) return embedded;
    }

    // Attempt 3: repair a truncated array. Truncated output usually has no
    // closing "]" at all, so this must not depend on one being present. Walk
    // backwards over "}" positions and keep the longest prefix that parses.
    let brace = cleaned.lastIndexOf("}");
    for (let tries = 0; brace > open && tries < MAX_REPAIR_TRIES; tries++) {
      const repaired = tryParse(cleaned.slice(open, brace + 1) + "]");
      if (repaired !== null) return repaired;
      brace = cleaned.lastIndexOf("}", brace - 1);
    }
  }

  // Attempt 4: extract first JSON object, wrap in array
  const objMatch = cleaned.match(/\{[\s\S]*?\}/);
  if (objMatch) {
    const single = tryParse(objMatch[0]);
    if (single !== null) return single;
  }

  console.warn(
    "[KnowledgeExtract] Failed to parse JSON from LLM response:",
    raw.slice(0, 300),
  );
  return null;
}

/** Launches the service under the processor id "knowledge-extract". */
export async function run(): Promise<void> {
  await KnowledgeExtractService.launch("knowledge-extract");
}