Add unified explainability support and librarian storage for all retrieval engines (#693)

Add unified explainability support and librarian storage for all retrieval engines.

Implements consistent explainability/provenance tracking across GraphRAG, DocumentRAG, and Agent retrieval engines. All large content (answers, thoughts, observations) is now stored in librarian rather than as inline literals in the knowledge graph.

Explainability API:
- New explainability.py module with entity classes (Question, Exploration, Focus, Synthesis, Analysis, Conclusion) and ExplainabilityClient
- Quiescence-based eventual consistency handling for trace fetching
- Content fetching from librarian with retry logic

CLI updates:
- tg-invoke-graph-rag -x/--explainable flag returns explain_id
- tg-invoke-document-rag -x/--explainable flag returns explain_id
- tg-invoke-agent -x/--explainable flag returns explain_id
- tg-list-explain-traces uses new explainability API
- tg-show-explain-trace handles all three trace types

Agent provenance:
- Records session, iterations (think/act/observe), and conclusion
- Stores thoughts and observations in librarian with document references
- New predicates: tg:thoughtDocument, tg:observationDocument

DocumentRAG provenance:
- Records question, exploration (chunk retrieval), and synthesis
- Stores answers in librarian with document references

Schema changes:
- AgentResponse: added explain_id, explain_graph fields
- RetrievalResponse: added explain_id, explain_graph fields
- agent_iteration_triples: supports thought_document_id, observation_document_id

Update tests.
2026-07-25 13:11:02 +02:00 · 2026-03-12 21:40:09 +00:00 · 2026-03-12 21:40:09 +00:00 · 35128ff019
commit 35128ff019
parent aecf00f040
24 changed files with 2736 additions and 846 deletions
--- a/trustgraph-flow/trustgraph/agent/react/service.py
+++ b/trustgraph-flow/trustgraph/agent/react/service.py
@ -2,6 +2,8 @@
 Simple agent infrastructure broadly implements the ReAct flow.
 """

+import asyncio
+import base64
 import json
 import re
 import sys
@ -17,9 +19,13 @@ from ... base import AgentService, TextCompletionClientSpec, PromptClientSpec
 from ... base import GraphRagClientSpec, ToolClientSpec, StructuredQueryClientSpec
 from ... base import RowEmbeddingsQueryClientSpec, EmbeddingsClientSpec
 from ... base import ProducerSpec
+from ... base import Consumer, Producer
+from ... base import ConsumerMetrics, ProducerMetrics

 from ... schema import AgentRequest, AgentResponse, AgentStep, Error
 from ... schema import Triples, Metadata
+from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
+from ... schema import librarian_request_queue, librarian_response_queue

 # Provenance imports for agent explainability
 from trustgraph.provenance import (
@ -41,6 +47,8 @@ from . types import Final, Action, Tool, Argument

 default_ident = "agent-manager"
 default_max_iterations = 10
+default_librarian_request_queue = librarian_request_queue
+default_librarian_response_queue = librarian_response_queue

 class Processor(AgentService):

@ -129,6 +137,115 @@ class Processor(AgentService):
            )
        )

+        # Librarian client for storing answer content
+        librarian_request_q = params.get(
+            "librarian_request_queue", default_librarian_request_queue
+        )
+        librarian_response_q = params.get(
+            "librarian_response_queue", default_librarian_response_queue
+        )
+
+        librarian_request_metrics = ProducerMetrics(
+            processor=id, flow=None, name="librarian-request"
+        )
+
+        self.librarian_request_producer = Producer(
+            backend=self.pubsub,
+            topic=librarian_request_q,
+            schema=LibrarianRequest,
+            metrics=librarian_request_metrics,
+        )
+
+        librarian_response_metrics = ConsumerMetrics(
+            processor=id, flow=None, name="librarian-response"
+        )
+
+        self.librarian_response_consumer = Consumer(
+            taskgroup=self.taskgroup,
+            backend=self.pubsub,
+            flow=None,
+            topic=librarian_response_q,
+            subscriber=f"{id}-librarian",
+            schema=LibrarianResponse,
+            handler=self.on_librarian_response,
+            metrics=librarian_response_metrics,
+        )
+
+        # Pending librarian requests: request_id -> asyncio.Future
+        self.pending_librarian_requests = {}
+
+    async def start(self):
+        """Start the base service, then the librarian request producer and response consumer."""
+        await super(Processor, self).start()
+        await self.librarian_request_producer.start()
+        await self.librarian_response_consumer.start()
+
+    async def on_librarian_response(self, msg, consumer, flow):
+        """
+        Handle a response from the librarian service.
+
+        Looks up the future registered in ``pending_librarian_requests``
+        under the message's ``id`` property and resolves it with the
+        response, waking the coroutine waiting on it.
+
+        Args:
+            msg: Incoming message; ``msg.value()`` is the response and
+                ``msg.properties()`` carries the correlating ``id``
+            consumer: Unused by this handler
+            flow: Unused by this handler
+        """
+        response = msg.value()
+        request_id = msg.properties().get("id")
+
+        future = self.pending_librarian_requests.pop(request_id, None)
+
+        if future is None:
+            logger.warning(f"Received unexpected librarian response: {request_id}")
+            return
+
+        # The waiter may already have timed out or been cancelled, which
+        # leaves the future done; calling set_result() on it would raise
+        # InvalidStateError inside the consumer handler.
+        if future.done():
+            logger.warning(f"Received late librarian response: {request_id}")
+            return
+
+        future.set_result(response)
+
+    async def save_answer_content(self, doc_id, user, content, title=None, timeout=120):
+        """
+        Save text content to the librarian as a document.
+
+        Sends an "add-document" request carrying the base64-encoded UTF-8
+        content and waits for the response correlated by request ID.
+
+        Args:
+            doc_id: ID for the document
+            user: User ID
+            content: Text content to store
+            title: Optional title (defaults to "Agent Answer")
+            timeout: Request timeout in seconds
+
+        Returns:
+            The document ID on success
+
+        Raises:
+            RuntimeError: If the librarian reports an error or no response
+                arrives within the timeout
+        """
+        request_id = str(uuid.uuid4())
+
+        doc_metadata = DocumentMetadata(
+            id=doc_id,
+            user=user,
+            kind="text/plain",
+            title=title or "Agent Answer",
+            document_type="answer",
+        )
+
+        request = LibrarianRequest(
+            operation="add-document",
+            document_id=doc_id,
+            document_metadata=doc_metadata,
+            content=base64.b64encode(content.encode("utf-8")).decode("utf-8"),
+            user=user,
+        )
+
+        # Register the future before sending so the response handler can
+        # always find it, however quickly the response arrives.
+        future = asyncio.get_running_loop().create_future()
+        self.pending_librarian_requests[request_id] = future
+
+        try:
+            # Send request
+            await self.librarian_request_producer.send(
+                request, properties={"id": request_id}
+            )
+
+            # Wait for response
+            response = await asyncio.wait_for(future, timeout=timeout)
+
+        except asyncio.TimeoutError:
+            raise RuntimeError(f"Timeout saving answer document {doc_id}")
+
+        finally:
+            # Drop the pending entry on every exit path (send failure,
+            # cancellation, timeout) so futures cannot accumulate; on
+            # success the response handler has already removed it.
+            self.pending_librarian_requests.pop(request_id, None)
+
+        if response.error:
+            raise RuntimeError(
+                f"Librarian error saving answer: {response.error.type}: {response.error.message}"
+            )
+
+        return doc_id
+
    async def on_tools_config(self, config, version):

        logger.info(f"Loading configuration version {version}")
@ -347,6 +464,15 @@ class Processor(AgentService):
                ))
                logger.debug(f"Emitted session triples for {session_uri}")

+                # Send explain event for session
+                if streaming:
+                    await respond(AgentResponse(
+                        chunk_type="explain",
+                        content="",
+                        explain_id=session_uri,
+                        explain_graph=GRAPH_RETRIEVAL,
+                    ))
+
            logger.info(f"Question: {request.question}")

            if len(history) >= self.max_iterations:
@ -504,8 +630,28 @@ class Processor(AgentService):
                else:
                    parent_uri = session_uri

+                # Save answer to librarian
+                answer_doc_id = None
+                if f:
+                    answer_doc_id = f"urn:trustgraph:agent:{session_id}/answer"
+                    try:
+                        await self.save_answer_content(
+                            doc_id=answer_doc_id,
+                            user=request.user,
+                            content=f,
+                            title=f"Agent Answer: {request.question[:50]}...",
+                        )
+                        logger.debug(f"Saved answer to librarian: {answer_doc_id}")
+                    except Exception as e:
+                        logger.warning(f"Failed to save answer to librarian: {e}")
+                        answer_doc_id = None  # Fall back to inline content
+
                final_triples = set_graph(
-                    agent_final_triples(final_uri, parent_uri, f),
+                    agent_final_triples(
+                        final_uri, parent_uri,
+                        answer="" if answer_doc_id else f,
+                        document_id=answer_doc_id,
+                    ),
                    GRAPH_RETRIEVAL
                )
                await flow("explainability").send(Triples(
@ -518,6 +664,15 @@ class Processor(AgentService):
                ))
                logger.debug(f"Emitted final triples for {final_uri}")

+                # Send explain event for conclusion
+                if streaming:
+                    await respond(AgentResponse(
+                        chunk_type="explain",
+                        content="",
+                        explain_id=final_uri,
+                        explain_graph=GRAPH_RETRIEVAL,
+                    ))
+
                if streaming:
                    # Streaming format - send end-of-dialog marker
                    # Answer chunks were already sent via answer() callback during parsing
@ -558,14 +713,48 @@ class Processor(AgentService):
            else:
                parent_uri = session_uri

+            # Save thought to librarian
+            thought_doc_id = None
+            if act.thought:
+                thought_doc_id = f"urn:trustgraph:agent:{session_id}/i{iteration_num}/thought"
+                try:
+                    await self.save_answer_content(
+                        doc_id=thought_doc_id,
+                        user=request.user,
+                        content=act.thought,
+                        title=f"Agent Thought: {act.name}",
+                    )
+                    logger.debug(f"Saved thought to librarian: {thought_doc_id}")
+                except Exception as e:
+                    logger.warning(f"Failed to save thought to librarian: {e}")
+                    thought_doc_id = None
+
+            # Save observation to librarian
+            observation_doc_id = None
+            if act.observation:
+                observation_doc_id = f"urn:trustgraph:agent:{session_id}/i{iteration_num}/observation"
+                try:
+                    await self.save_answer_content(
+                        doc_id=observation_doc_id,
+                        user=request.user,
+                        content=act.observation,
+                        title=f"Agent Observation: {act.name}",
+                    )
+                    logger.debug(f"Saved observation to librarian: {observation_doc_id}")
+                except Exception as e:
+                    logger.warning(f"Failed to save observation to librarian: {e}")
+                    observation_doc_id = None
+
            iter_triples = set_graph(
                agent_iteration_triples(
                    iteration_uri,
                    parent_uri,
-                    act.thought,
-                    act.name,
-                    act.arguments,
-                    act.observation,
+                    thought="" if thought_doc_id else act.thought,
+                    action=act.name,
+                    arguments=act.arguments,
+                    observation="" if observation_doc_id else act.observation,
+                    thought_document_id=thought_doc_id,
+                    observation_document_id=observation_doc_id,
                ),
                GRAPH_RETRIEVAL
            )
@ -579,6 +768,15 @@ class Processor(AgentService):
            ))
            logger.debug(f"Emitted iteration triples for {iteration_uri}")

+            # Send explain event for iteration
+            if streaming:
+                await respond(AgentResponse(
+                    chunk_type="explain",
+                    content="",
+                    explain_id=iteration_uri,
+                    explain_graph=GRAPH_RETRIEVAL,
+                ))
+
            history.append(act)

            # Handle state transitions if tool execution was successful
--- a/trustgraph-flow/trustgraph/retrieval/document_rag/document_rag.py
+++ b/trustgraph-flow/trustgraph/retrieval/document_rag/document_rag.py
@ -109,7 +109,7 @@ class DocumentRag:
    async def query(
            self, query, user="trustgraph", collection="default",
            doc_limit=20, streaming=False, chunk_callback=None,
-            explain_callback=None,
+            explain_callback=None, save_answer_callback=None,
    ):
        """
        Execute a Document RAG query with optional explainability tracking.
@ -122,6 +122,7 @@ class DocumentRag:
            streaming: Enable streaming LLM response
            chunk_callback: async def callback(chunk, end_of_stream) for streaming
            explain_callback: async def callback(triples, explain_id) for explainability
+            save_answer_callback: async def callback(doc_id, answer_text) to save answer to librarian

        Returns:
            str: The synthesized answer text
@ -192,9 +193,28 @@ class DocumentRag:

        # Emit synthesis explainability after answer generated
        if explain_callback:
+            synthesis_doc_id = None
            answer_text = resp if resp else ""
+
+            # Save answer to librarian if callback provided
+            if save_answer_callback and answer_text:
+                # Generate document ID as URN matching query-time provenance format
+                synthesis_doc_id = f"urn:trustgraph:docrag:{session_id}/answer"
+                try:
+                    await save_answer_callback(synthesis_doc_id, answer_text)
+                    if self.verbose:
+                        logger.debug(f"Saved answer to librarian: {synthesis_doc_id}")
+                except Exception as e:
+                    logger.warning(f"Failed to save answer to librarian: {e}")
+                    synthesis_doc_id = None  # Fall back to inline content
+
+            # Generate triples with document reference or inline content
            syn_triples = set_graph(
-                docrag_synthesis_triples(syn_uri, exp_uri, answer_text),
+                docrag_synthesis_triples(
+                    syn_uri, exp_uri,
+                    answer_text="" if synthesis_doc_id else answer_text,
+                    document_id=synthesis_doc_id,
+                ),
                GRAPH_RETRIEVAL
            )
            await explain_callback(syn_triples, syn_uri)
--- a/trustgraph-flow/trustgraph/retrieval/document_rag/rag.py
+++ b/trustgraph-flow/trustgraph/retrieval/document_rag/rag.py
@ -8,8 +8,10 @@ import asyncio
 import base64
 import logging

+import uuid
+
 from ... schema import DocumentRagQuery, DocumentRagResponse, Error
-from ... schema import LibrarianRequest, LibrarianResponse
+from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
 from ... schema import librarian_request_queue, librarian_response_queue
 from ... schema import Triples, Metadata
 from ... provenance import GRAPH_RETRIEVAL
@ -179,6 +181,62 @@ class Processor(FlowProcessor):
            self.pending_requests.pop(request_id, None)
            raise RuntimeError(f"Timeout fetching chunk {chunk_id}")

+    async def save_answer_content(self, doc_id, user, content, title=None, timeout=120):
+        """
+        Save answer content to the librarian as a document.
+
+        Sends an "add-document" request carrying the base64-encoded UTF-8
+        content and waits for the response correlated by request ID.
+
+        Args:
+            doc_id: ID for the answer document
+            user: User ID
+            content: Answer text content
+            title: Optional title (defaults to "DocumentRAG Answer")
+            timeout: Request timeout in seconds
+
+        Returns:
+            The document ID on success
+
+        Raises:
+            RuntimeError: If the librarian reports an error or no response
+                arrives within the timeout
+        """
+        request_id = str(uuid.uuid4())
+
+        doc_metadata = DocumentMetadata(
+            id=doc_id,
+            user=user,
+            kind="text/plain",
+            title=title or "DocumentRAG Answer",
+            document_type="answer",
+        )
+
+        request = LibrarianRequest(
+            operation="add-document",
+            document_id=doc_id,
+            document_metadata=doc_metadata,
+            content=base64.b64encode(content.encode("utf-8")).decode("utf-8"),
+            user=user,
+        )
+
+        # Register the future before sending so the response handler can
+        # always find it, however quickly the response arrives.
+        future = asyncio.get_running_loop().create_future()
+        self.pending_requests[request_id] = future
+
+        try:
+            # Send request
+            await self.librarian_request_producer.send(
+                request, properties={"id": request_id}
+            )
+
+            # Wait for response
+            response = await asyncio.wait_for(future, timeout=timeout)
+
+        except asyncio.TimeoutError:
+            raise RuntimeError(f"Timeout saving answer document {doc_id}")
+
+        finally:
+            # Drop the pending entry on every exit path (send failure,
+            # cancellation, timeout) so futures cannot accumulate;
+            # pop() with a default is a no-op if it was already removed.
+            self.pending_requests.pop(request_id, None)
+
+        if response.error:
+            raise RuntimeError(
+                f"Librarian error saving answer: {response.error.type}: {response.error.message}"
+            )
+
+        return doc_id
+
    async def on_request(self, msg, consumer, flow):

        try:
@ -222,10 +280,20 @@ class Processor(FlowProcessor):
                        response=None,
                        explain_id=explain_id,
                        explain_graph=GRAPH_RETRIEVAL,
+                        message_type="explain",
                    ),
                    properties={"id": id}
                )

+            # Callback to save answer content to librarian
+            async def save_answer(doc_id, answer_text):
+                await self.save_answer_content(
+                    doc_id=doc_id,
+                    user=v.user,
+                    content=answer_text,
+                    title=f"DocumentRAG Answer: {v.query[:50]}...",
+                )
+
            # Check if streaming is requested
            if v.streaming:
                # Define async callback for streaming chunks
@ -235,6 +303,7 @@ class Processor(FlowProcessor):
                        DocumentRagResponse(
                            response=chunk,
                            end_of_stream=end_of_stream,
+                            message_type="chunk",
                            error=None
                        ),
                        properties={"id": id}
@ -250,6 +319,17 @@ class Processor(FlowProcessor):
                    streaming=True,
                    chunk_callback=send_chunk,
                    explain_callback=send_explainability,
+                    save_answer_callback=save_answer,
+                )
+
+                # Send end_of_session to signal entire session is complete
+                await flow("response").send(
+                    DocumentRagResponse(
+                        response=None,
+                        end_of_session=True,
+                        message_type="end",
+                    ),
+                    properties={"id": id}
                )
            else:
                # Non-streaming path (existing behavior)
@ -259,6 +339,7 @@ class Processor(FlowProcessor):
                    collection=v.collection,
                    doc_limit=doc_limit,
                    explain_callback=send_explainability,
+                    save_answer_callback=save_answer,
                )

                await flow("response").send(