trustgraph/ts/packages/flow/src/extract/knowledge-extract.ts

/**
 * Knowledge extraction service — extracts relationships and definitions from text chunks.
 *
 * A FlowProcessor that:
 * 1. Consumes Chunk messages
 * 2. Uses prompt service + LLM to extract relationships and definitions
 * 3. Converts extractions into RDF triples and entity contexts
 * 4. Emits Triples and EntityContexts messages
 *
 * Python reference: trustgraph-flow/trustgraph/extract/knowledge/service.py
 */

import {
  FlowProcessor,
  ConsumerSpec,
  ProducerSpec,
  RequestResponseSpec,
  type ProcessorConfig,
  type FlowContext,
  type Chunk,
  type Triples,
  type EntityContexts,
  type EntityContext,
  type PromptRequest,
  type PromptResponse,
  type TextCompletionRequest,
  type TextCompletionResponse,
  type Triple,
  type Term,
} from "@trustgraph/base";

// Well-known RDF/SKOS IRIs
const RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label";
const SKOS_DEFINITION = "http://www.w3.org/2004/02/skos/core#definition";

interface ExtractedRelationship {
  subject: string;
  predicate: string;
  object: string;
}

interface ExtractedDefinition {
  entity: string;
  definition: string;
}

export class KnowledgeExtractService extends FlowProcessor {
  constructor(config: ProcessorConfig) {
    super(config);

    this.registerSpecification(
      new ConsumerSpec<Chunk>("input", this.onMessage.bind(this)),
    );
    this.registerSpecification(new ProducerSpec<Triples>("triples"));
    this.registerSpecification(new ProducerSpec<EntityContexts>("entity-contexts"));

    this.registerSpecification(
      new RequestResponseSpec<PromptRequest, PromptResponse>(
        "prompt-client",
        "prompt-request",
        "prompt-response",
      ),
    );
    this.registerSpecification(
      new RequestResponseSpec<TextCompletionRequest, TextCompletionResponse>(
        "llm-client",
        "text-completion-request",
        "text-completion-response",
      ),
    );

    console.log("[KnowledgeExtract] Service initialized");
  }

  private async onMessage(
    msg: Chunk,
    properties: Record<string, string>,
    flowCtx: FlowContext,
  ): Promise<void> {
    const requestId = properties.id;
    if (!requestId) return;

    const text = msg.chunk;
    if (!text || text.trim().length === 0) return;

    const promptClient = flowCtx.flow.requestor<PromptRequest, PromptResponse>("prompt-client");
    const llmClient = flowCtx.flow.requestor<TextCompletionRequest, TextCompletionResponse>("llm-client");
    const triplesProducer = flowCtx.flow.producer<Triples>("triples");
    const entityContextsProducer = flowCtx.flow.producer<EntityContexts>("entity-contexts");

    const allTriples: Triple[] = [];
    const allEntityContexts: EntityContext[] = [];

    // --- Extract relationships ---
    try {
      const relPrompt = await promptClient.request({
        name: "extract-relationships",
        variables: { text },
      });

      if (!relPrompt.error) {
        const relCompletion = await llmClient.request({
          system: relPrompt.system,
          prompt: relPrompt.prompt,
        });

        if (!relCompletion.error && relCompletion.response) {
          const relationships = parseJsonResponse<ExtractedRelationship[]>(relCompletion.response);

          if (relationships) {
            for (const rel of relationships) {
              if (!rel.subject || !rel.predicate || !rel.object) continue;

              const subjectIri = toEntityIri(rel.subject);
              const predicateIri = toEntityIri(rel.predicate);
              const objectIri = toEntityIri(rel.object);

              // Main relationship triple
              allTriples.push({ s: subjectIri, p: predicateIri, o: objectIri });

              // rdfs:label triples for each entity
              allTriples.push({
                s: subjectIri,
                p: iriTerm(RDFS_LABEL),
                o: literalTerm(rel.subject),
              });
              allTriples.push({
                s: predicateIri,
                p: iriTerm(RDFS_LABEL),
                o: literalTerm(rel.predicate),
              });
              allTriples.push({
                s: objectIri,
                p: iriTerm(RDFS_LABEL),
                o: literalTerm(rel.object),
              });

              // Entity contexts for subject and object
              allEntityContexts.push({
                entity: subjectIri,
                context: text,
                chunkId: msg.documentId,
              });
              allEntityContexts.push({
                entity: objectIri,
                context: text,
                chunkId: msg.documentId,
              });
            }

            console.log(`[KnowledgeExtract] Extracted ${relationships.length} relationships`);
          }
        }
      }
    } catch (err) {
      console.error("[KnowledgeExtract] Relationship extraction failed:", err);
    }

    // --- Extract definitions ---
    try {
      const defPrompt = await promptClient.request({
        name: "extract-definitions",
        variables: { text },
      });

      if (!defPrompt.error) {
        const defCompletion = await llmClient.request({
          system: defPrompt.system,
          prompt: defPrompt.prompt,
        });

        if (!defCompletion.error && defCompletion.response) {
          const definitions = parseJsonResponse<ExtractedDefinition[]>(defCompletion.response);

          if (definitions) {
            for (const def of definitions) {
              if (!def.entity || !def.definition) continue;

              const entityIri = toEntityIri(def.entity);

              // Definition triple
              allTriples.push({
                s: entityIri,
                p: iriTerm(SKOS_DEFINITION),
                o: literalTerm(def.definition),
              });

              // Label triple
              allTriples.push({
                s: entityIri,
                p: iriTerm(RDFS_LABEL),
                o: literalTerm(def.entity),
              });

              // Entity context
              allEntityContexts.push({
                entity: entityIri,
                context: text,
                chunkId: msg.documentId,
              });
            }

            console.log(`[KnowledgeExtract] Extracted ${definitions.length} definitions`);
          }
        }
      }
    } catch (err) {
      console.error("[KnowledgeExtract] Definition extraction failed:", err);
    }

    // --- Emit results ---
    if (allTriples.length > 0) {
      await triplesProducer.send(requestId, {
        metadata: msg.metadata,
        triples: allTriples,
      });
    }

    if (allEntityContexts.length > 0) {
      await entityContextsProducer.send(requestId, {
        metadata: msg.metadata,
        entities: allEntityContexts,
      });
    }
  }
}

// ---------- Helpers ----------

function toEntityIri(name: string): Term {
  const slug = encodeURIComponent(name.toLowerCase().replace(/\s+/g, "-"));
  return {
    type: "IRI",
    iri: `http://trustgraph.ai/e/${slug}`,
  };
}

function iriTerm(iri: string): Term {
  return { type: "IRI", iri };
}

function literalTerm(value: string): Term {
  return { type: "LITERAL", value };
}

/**
 * Parse JSON from LLM output, handling markdown code fences and malformed output.
 */
function parseJsonResponse<T>(raw: string): T | null {
  try {
    // Strip markdown code fences
    let cleaned = raw.trim();

    // Remove ```json ... ``` or ``` ... ```
    const fenceMatch = cleaned.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?```$/);
    if (fenceMatch) {
      cleaned = fenceMatch[1].trim();
    }

    return JSON.parse(cleaned) as T;
  } catch {
    console.warn("[KnowledgeExtract] Failed to parse JSON from LLM response:", raw.slice(0, 200));
    return null;
  }
}

export async function run(): Promise<void> {
  await KnowledgeExtractService.launch("knowledge-extract");
}
feat: add document pipeline, ReAct agent, and knowledge core services Document Pipeline (Team A): - LibrarianService: document storage with filesystem backend, metadata persistence, child document hierarchy, collection management - ChunkingService: recursive character text splitter with configurable chunk size/overlap, FlowProcessor pattern - KnowledgeExtractService: combined relationship + definition extraction using prompt service and LLM, emits RDF triples and entity contexts - KnowledgeCoreService: knowledge core CRUD with streaming export and flow-based loading ReAct Agent (Team B): - StreamingReActParser: state machine for parsing LLM output into Thought/Action/ActionInput/FinalAnswer sections - Three MVP tools: KnowledgeQuery (GraphRAG), DocumentQuery (DocRAG), TriplesQuery with RequestResponse clients - AgentService FlowProcessor with ReAct loop, tool execution, and streaming chunk responses (thought/observation/answer) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-06 00:19:37 -05:00			`/**`
			`* Knowledge extraction service — extracts relationships and definitions from text chunks.`
			`*`
			`* A FlowProcessor that:`
			`* 1. Consumes Chunk messages`
			`* 2. Uses prompt service + LLM to extract relationships and definitions`
			`* 3. Converts extractions into RDF triples and entity contexts`
			`* 4. Emits Triples and EntityContexts messages`
			`*`
			`* Python reference: trustgraph-flow/trustgraph/extract/knowledge/service.py`
			`*/`

			`import {`
			`FlowProcessor,`
			`ConsumerSpec,`
			`ProducerSpec,`
			`RequestResponseSpec,`
			`type ProcessorConfig,`
			`type FlowContext,`
			`type Chunk,`
			`type Triples,`
			`type EntityContexts,`
			`type EntityContext,`
			`type PromptRequest,`
			`type PromptResponse,`
			`type TextCompletionRequest,`
			`type TextCompletionResponse,`
			`type Triple,`
			`type Term,`
			`} from "@trustgraph/base";`

			`// Well-known RDF/SKOS IRIs`
			`const RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label";`
			`const SKOS_DEFINITION = "http://www.w3.org/2004/02/skos/core#definition";`

			`interface ExtractedRelationship {`
			`subject: string;`
			`predicate: string;`
			`object: string;`
			`}`

			`interface ExtractedDefinition {`
			`entity: string;`
			`definition: string;`
			`}`

			`export class KnowledgeExtractService extends FlowProcessor {`
			`constructor(config: ProcessorConfig) {`
			`super(config);`

			`this.registerSpecification(`
			`new ConsumerSpec<Chunk>("input", this.onMessage.bind(this)),`
			`);`
			`this.registerSpecification(new ProducerSpec<Triples>("triples"));`
			`this.registerSpecification(new ProducerSpec<EntityContexts>("entity-contexts"));`

			`this.registerSpecification(`
			`new RequestResponseSpec<PromptRequest, PromptResponse>(`
			`"prompt-client",`
			`"prompt-request",`
			`"prompt-response",`
			`),`
			`);`
			`this.registerSpecification(`
			`new RequestResponseSpec<TextCompletionRequest, TextCompletionResponse>(`
			`"llm-client",`
			`"text-completion-request",`
			`"text-completion-response",`
			`),`
			`);`

			`console.log("[KnowledgeExtract] Service initialized");`
			`}`

			`private async onMessage(`
			`msg: Chunk,`
			`properties: Record<string, string>,`
			`flowCtx: FlowContext,`
			`): Promise<void> {`
			`const requestId = properties.id;`
			`if (!requestId) return;`

			`const text = msg.chunk;`
			`if (!text \|\| text.trim().length === 0) return;`

			`const promptClient = flowCtx.flow.requestor<PromptRequest, PromptResponse>("prompt-client");`
			`const llmClient = flowCtx.flow.requestor<TextCompletionRequest, TextCompletionResponse>("llm-client");`
			`const triplesProducer = flowCtx.flow.producer<Triples>("triples");`
			`const entityContextsProducer = flowCtx.flow.producer<EntityContexts>("entity-contexts");`

			`const allTriples: Triple[] = [];`
			`const allEntityContexts: EntityContext[] = [];`

			`// --- Extract relationships ---`
			`try {`
			`const relPrompt = await promptClient.request({`
			`name: "extract-relationships",`
			`variables: { text },`
			`});`

			`if (!relPrompt.error) {`
			`const relCompletion = await llmClient.request({`
			`system: relPrompt.system,`
			`prompt: relPrompt.prompt,`
			`});`

			`if (!relCompletion.error && relCompletion.response) {`
			`const relationships = parseJsonResponse<ExtractedRelationship[]>(relCompletion.response);`

			`if (relationships) {`
			`for (const rel of relationships) {`
			`if (!rel.subject \|\| !rel.predicate \|\| !rel.object) continue;`

			`const subjectIri = toEntityIri(rel.subject);`
			`const predicateIri = toEntityIri(rel.predicate);`
			`const objectIri = toEntityIri(rel.object);`

			`// Main relationship triple`
			`allTriples.push({ s: subjectIri, p: predicateIri, o: objectIri });`

			`// rdfs:label triples for each entity`
			`allTriples.push({`
			`s: subjectIri,`
			`p: iriTerm(RDFS_LABEL),`
			`o: literalTerm(rel.subject),`
			`});`
			`allTriples.push({`
			`s: predicateIri,`
			`p: iriTerm(RDFS_LABEL),`
			`o: literalTerm(rel.predicate),`
			`});`
			`allTriples.push({`
			`s: objectIri,`
			`p: iriTerm(RDFS_LABEL),`
			`o: literalTerm(rel.object),`
			`});`

			`// Entity contexts for subject and object`
			`allEntityContexts.push({`
			`entity: subjectIri,`
			`context: text,`
			`chunkId: msg.documentId,`
			`});`
			`allEntityContexts.push({`
			`entity: objectIri,`
			`context: text,`
			`chunkId: msg.documentId,`
			`});`
			`}`

			console.log(`[KnowledgeExtract] Extracted ${relationships.length} relationships`);
			`}`
			`}`
			`}`
			`} catch (err) {`
			`console.error("[KnowledgeExtract] Relationship extraction failed:", err);`
			`}`

			`// --- Extract definitions ---`
			`try {`
			`const defPrompt = await promptClient.request({`
			`name: "extract-definitions",`
			`variables: { text },`
			`});`

			`if (!defPrompt.error) {`
			`const defCompletion = await llmClient.request({`
			`system: defPrompt.system,`
			`prompt: defPrompt.prompt,`
			`});`

			`if (!defCompletion.error && defCompletion.response) {`
			`const definitions = parseJsonResponse<ExtractedDefinition[]>(defCompletion.response);`

			`if (definitions) {`
			`for (const def of definitions) {`
			`if (!def.entity \|\| !def.definition) continue;`

			`const entityIri = toEntityIri(def.entity);`

			`// Definition triple`
			`allTriples.push({`
			`s: entityIri,`
			`p: iriTerm(SKOS_DEFINITION),`
			`o: literalTerm(def.definition),`
			`});`

			`// Label triple`
			`allTriples.push({`
			`s: entityIri,`
			`p: iriTerm(RDFS_LABEL),`
			`o: literalTerm(def.entity),`
			`});`

			`// Entity context`
			`allEntityContexts.push({`
			`entity: entityIri,`
			`context: text,`
			`chunkId: msg.documentId,`
			`});`
			`}`

			console.log(`[KnowledgeExtract] Extracted ${definitions.length} definitions`);
			`}`
			`}`
			`}`
			`} catch (err) {`
			`console.error("[KnowledgeExtract] Definition extraction failed:", err);`
			`}`

			`// --- Emit results ---`
			`if (allTriples.length > 0) {`
			`await triplesProducer.send(requestId, {`
			`metadata: msg.metadata,`
			`triples: allTriples,`
			`});`
			`}`

			`if (allEntityContexts.length > 0) {`
			`await entityContextsProducer.send(requestId, {`
			`metadata: msg.metadata,`
			`entities: allEntityContexts,`
			`});`
			`}`
			`}`
			`}`

			`// ---------- Helpers ----------`

			`function toEntityIri(name: string): Term {`
			`const slug = encodeURIComponent(name.toLowerCase().replace(/\s+/g, "-"));`
			`return {`
			`type: "IRI",`
			iri: `http://trustgraph.ai/e/${slug}`,
			`};`
			`}`

			`function iriTerm(iri: string): Term {`
			`return { type: "IRI", iri };`
			`}`

			`function literalTerm(value: string): Term {`
			`return { type: "LITERAL", value };`
			`}`

			`/**`
			`* Parse JSON from LLM output, handling markdown code fences and malformed output.`
			`*/`
			`function parseJsonResponse<T>(raw: string): T \| null {`
			`try {`
			`// Strip markdown code fences`
			`let cleaned = raw.trim();`

			// Remove ```json ... ``` or ``` ... ```
			const fenceMatch = cleaned.match(/^```(?:json)?\s\n?([\s\S]?)\n?```$/);
			`if (fenceMatch) {`
			`cleaned = fenceMatch[1].trim();`
			`}`

			`return JSON.parse(cleaned) as T;`
			`} catch {`
			`console.warn("[KnowledgeExtract] Failed to parse JSON from LLM response:", raw.slice(0, 200));`
			`return null;`
			`}`
			`}`

			`export async function run(): Promise<void> {`
			`await KnowledgeExtractService.launch("knowledge-extract");`
			`}`