mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Expose LLM token usage across all service layers (#782)
Expose LLM token usage (in_token, out_token, model) across all service layers Propagate token counts from LLM services through the prompt, text-completion, graph-RAG, document-RAG, and agent orchestrator pipelines to the API gateway and Python SDK. All fields are Optional — None means "not available", distinguishing from a real zero count. Key changes: - Schema: Add in_token/out_token/model to TextCompletionResponse, PromptResponse, GraphRagResponse, DocumentRagResponse, AgentResponse - TextCompletionClient: New TextCompletionResult return type. Split into text_completion() (non-streaming) and text_completion_stream() (streaming with per-chunk handler callback) - PromptClient: New PromptResult with response_type (text/json/jsonl), typed fields (text/object/objects), and token usage. All callers updated. - RAG services: Accumulate token usage across all prompt calls (extract-concepts, edge-scoring, edge-reasoning, synthesis). Non-streaming path sends single combined response instead of chunk + end_of_session. - Agent orchestrator: UsageTracker accumulates tokens across meta-router, pattern prompt calls, and react reasoning. Attached to end_of_dialog. - Translators: Encode token fields when not None (is not None, not truthy) - Python SDK: RAG and text-completion methods return TextCompletionResult (non-streaming) or RAGChunk/AgentAnswer with token fields (streaming) - CLI: --show-usage flag on tg-invoke-llm, tg-invoke-prompt, tg-invoke-graph-rag, tg-invoke-document-rag, tg-invoke-agent
This commit is contained in:
parent
67cfa80836
commit
14e49d83c7
60 changed files with 1252 additions and 577 deletions
|
|
@ -107,6 +107,7 @@ from .types import (
|
|||
AgentObservation,
|
||||
AgentAnswer,
|
||||
RAGChunk,
|
||||
TextCompletionResult,
|
||||
ProvenanceEvent,
|
||||
)
|
||||
|
||||
|
|
@ -185,6 +186,7 @@ __all__ = [
|
|||
"AgentObservation",
|
||||
"AgentAnswer",
|
||||
"RAGChunk",
|
||||
"TextCompletionResult",
|
||||
"ProvenanceEvent",
|
||||
|
||||
# Exceptions
|
||||
|
|
|
|||
|
|
@ -14,6 +14,8 @@ import aiohttp
|
|||
import json
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
from . types import TextCompletionResult
|
||||
|
||||
from . exceptions import ProtocolException, ApplicationException
|
||||
|
||||
|
||||
|
|
@ -434,12 +436,11 @@ class AsyncFlowInstance:
|
|||
|
||||
return await self.request("agent", request_data)
|
||||
|
||||
async def text_completion(self, system: str, prompt: str, **kwargs: Any) -> str:
|
||||
async def text_completion(self, system: str, prompt: str, **kwargs: Any) -> TextCompletionResult:
|
||||
"""
|
||||
Generate text completion (non-streaming).
|
||||
|
||||
Generates a text response from an LLM given a system prompt and user prompt.
|
||||
Returns the complete response text.
|
||||
|
||||
Note: This method does not support streaming. For streaming text generation,
|
||||
use AsyncSocketFlowInstance.text_completion() instead.
|
||||
|
|
@ -450,19 +451,19 @@ class AsyncFlowInstance:
|
|||
**kwargs: Additional service-specific parameters
|
||||
|
||||
Returns:
|
||||
str: Complete generated text response
|
||||
TextCompletionResult: Result with text, in_token, out_token, model
|
||||
|
||||
Example:
|
||||
```python
|
||||
async_flow = await api.async_flow()
|
||||
flow = async_flow.id("default")
|
||||
|
||||
# Generate text
|
||||
response = await flow.text_completion(
|
||||
result = await flow.text_completion(
|
||||
system="You are a helpful assistant.",
|
||||
prompt="Explain quantum computing in simple terms."
|
||||
)
|
||||
print(response)
|
||||
print(result.text)
|
||||
print(f"Tokens: {result.in_token} in, {result.out_token} out")
|
||||
```
|
||||
"""
|
||||
request_data = {
|
||||
|
|
@ -473,7 +474,12 @@ class AsyncFlowInstance:
|
|||
request_data.update(kwargs)
|
||||
|
||||
result = await self.request("text-completion", request_data)
|
||||
return result.get("response", "")
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
async def graph_rag(self, query: str, user: str, collection: str,
|
||||
max_subgraph_size: int = 1000, max_subgraph_count: int = 5,
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import asyncio
|
|||
import websockets
|
||||
from typing import Optional, Dict, Any, AsyncIterator, Union
|
||||
|
||||
from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk
|
||||
from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk, TextCompletionResult
|
||||
from . exceptions import ProtocolException, ApplicationException
|
||||
|
||||
|
||||
|
|
@ -199,7 +199,10 @@ class AsyncSocketClient:
|
|||
return AgentAnswer(
|
||||
content=resp.get("content", ""),
|
||||
end_of_message=resp.get("end_of_message", False),
|
||||
end_of_dialog=resp.get("end_of_dialog", False)
|
||||
end_of_dialog=resp.get("end_of_dialog", False),
|
||||
in_token=resp.get("in_token"),
|
||||
out_token=resp.get("out_token"),
|
||||
model=resp.get("model"),
|
||||
)
|
||||
elif chunk_type == "action":
|
||||
return AgentThought(
|
||||
|
|
@ -211,7 +214,10 @@ class AsyncSocketClient:
|
|||
return RAGChunk(
|
||||
content=content,
|
||||
end_of_stream=resp.get("end_of_stream", False),
|
||||
error=None
|
||||
error=None,
|
||||
in_token=resp.get("in_token"),
|
||||
out_token=resp.get("out_token"),
|
||||
model=resp.get("model"),
|
||||
)
|
||||
|
||||
async def aclose(self):
|
||||
|
|
@ -269,7 +275,11 @@ class AsyncSocketFlowInstance:
|
|||
return await self.client._send_request("agent", self.flow_id, request)
|
||||
|
||||
async def text_completion(self, system: str, prompt: str, streaming: bool = False, **kwargs):
|
||||
"""Text completion with optional streaming"""
|
||||
"""Text completion with optional streaming.
|
||||
|
||||
Non-streaming: returns a TextCompletionResult with text and token counts.
|
||||
Streaming: returns an async iterator of RAGChunk (with token counts on the final chunk).
|
||||
"""
|
||||
request = {
|
||||
"system": system,
|
||||
"prompt": prompt,
|
||||
|
|
@ -281,13 +291,18 @@ class AsyncSocketFlowInstance:
|
|||
return self._text_completion_streaming(request)
|
||||
else:
|
||||
result = await self.client._send_request("text-completion", self.flow_id, request)
|
||||
return result.get("response", "")
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
async def _text_completion_streaming(self, request):
    """Helper for streaming text completion. Yields RAGChunk objects.

    Per the streaming contract, token usage (in_token / out_token /
    model) is carried on the final chunk.
    """
    async for chunk in self.client._send_request_streaming(
            "text-completion", self.flow_id, request):
        # Forward only structured RAGChunk objects; any other message
        # type in the stream is ignored.
        if isinstance(chunk, RAGChunk):
            yield chunk
|
||||
|
||||
async def graph_rag(self, query: str, user: str, collection: str,
|
||||
max_subgraph_size: int = 1000, max_subgraph_count: int = 5,
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ import base64
|
|||
|
||||
from .. knowledge import hash, Uri, Literal, QuotedTriple
|
||||
from .. schema import IRI, LITERAL, TRIPLE
|
||||
from . types import Triple
|
||||
from . types import Triple, TextCompletionResult
|
||||
from . exceptions import ProtocolException
|
||||
|
||||
|
||||
|
|
@ -360,16 +360,17 @@ class FlowInstance:
|
|||
prompt: User prompt/question
|
||||
|
||||
Returns:
|
||||
str: Generated response text
|
||||
TextCompletionResult: Result with text, in_token, out_token, model
|
||||
|
||||
Example:
|
||||
```python
|
||||
flow = api.flow().id("default")
|
||||
response = flow.text_completion(
|
||||
result = flow.text_completion(
|
||||
system="You are a helpful assistant",
|
||||
prompt="What is quantum computing?"
|
||||
)
|
||||
print(response)
|
||||
print(result.text)
|
||||
print(f"Tokens: {result.in_token} in, {result.out_token} out")
|
||||
```
|
||||
"""
|
||||
|
||||
|
|
@ -379,10 +380,17 @@ class FlowInstance:
|
|||
"prompt": prompt
|
||||
}
|
||||
|
||||
return self.request(
|
||||
result = self.request(
|
||||
"service/text-completion",
|
||||
input
|
||||
)["response"]
|
||||
)
|
||||
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def agent(self, question, user="trustgraph", state=None, group=None, history=None):
|
||||
"""
|
||||
|
|
@ -498,10 +506,17 @@ class FlowInstance:
|
|||
"edge-limit": edge_limit,
|
||||
}
|
||||
|
||||
return self.request(
|
||||
result = self.request(
|
||||
"service/graph-rag",
|
||||
input
|
||||
)["response"]
|
||||
)
|
||||
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def document_rag(
|
||||
self, query, user="trustgraph", collection="default",
|
||||
|
|
@ -543,10 +558,17 @@ class FlowInstance:
|
|||
"doc-limit": doc_limit,
|
||||
}
|
||||
|
||||
return self.request(
|
||||
result = self.request(
|
||||
"service/document-rag",
|
||||
input
|
||||
)["response"]
|
||||
)
|
||||
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def embeddings(self, texts):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ import websockets
|
|||
from typing import Optional, Dict, Any, Iterator, Union, List
|
||||
from threading import Lock
|
||||
|
||||
from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk, StreamingChunk, ProvenanceEvent
|
||||
from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk, StreamingChunk, ProvenanceEvent, TextCompletionResult
|
||||
from . exceptions import ProtocolException, raise_from_error_dict
|
||||
|
||||
|
||||
|
|
@ -393,6 +393,9 @@ class SocketClient:
|
|||
end_of_message=resp.get("end_of_message", False),
|
||||
end_of_dialog=resp.get("end_of_dialog", False),
|
||||
message_id=resp.get("message_id", ""),
|
||||
in_token=resp.get("in_token"),
|
||||
out_token=resp.get("out_token"),
|
||||
model=resp.get("model"),
|
||||
)
|
||||
elif chunk_type == "action":
|
||||
return AgentThought(
|
||||
|
|
@ -404,7 +407,10 @@ class SocketClient:
|
|||
return RAGChunk(
|
||||
content=content,
|
||||
end_of_stream=resp.get("end_of_stream", False),
|
||||
error=None
|
||||
error=None,
|
||||
in_token=resp.get("in_token"),
|
||||
out_token=resp.get("out_token"),
|
||||
model=resp.get("model"),
|
||||
)
|
||||
|
||||
def _build_provenance_event(self, resp: Dict[str, Any]) -> ProvenanceEvent:
|
||||
|
|
@ -543,8 +549,12 @@ class SocketFlowInstance:
|
|||
streaming=True, include_provenance=True
|
||||
)
|
||||
|
||||
def text_completion(self, system: str, prompt: str, streaming: bool = False, **kwargs) -> Union[str, Iterator[str]]:
|
||||
"""Execute text completion with optional streaming."""
|
||||
def text_completion(self, system: str, prompt: str, streaming: bool = False, **kwargs) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
|
||||
"""Execute text completion with optional streaming.
|
||||
|
||||
Non-streaming: returns a TextCompletionResult with text and token counts.
|
||||
Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
|
||||
"""
|
||||
request = {
|
||||
"system": system,
|
||||
"prompt": prompt,
|
||||
|
|
@ -557,12 +567,17 @@ class SocketFlowInstance:
|
|||
if streaming:
|
||||
return self._text_completion_generator(result)
|
||||
else:
|
||||
return result.get("response", "")
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def _text_completion_generator(self, result: Iterator[StreamingChunk]) -> Iterator[RAGChunk]:
    """Yield RAGChunk objects from a streaming text-completion result.

    Token usage (in_token / out_token / model) arrives on the final
    chunk of the stream.
    """
    for chunk in result:
        # Pass through structured chunks only; skip anything else.
        if isinstance(chunk, RAGChunk):
            yield chunk
|
||||
|
||||
def graph_rag(
|
||||
self,
|
||||
|
|
@ -577,8 +592,12 @@ class SocketFlowInstance:
|
|||
edge_limit: int = 25,
|
||||
streaming: bool = False,
|
||||
**kwargs: Any
|
||||
) -> Union[str, Iterator[str]]:
|
||||
"""Execute graph-based RAG query with optional streaming."""
|
||||
) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
|
||||
"""Execute graph-based RAG query with optional streaming.
|
||||
|
||||
Non-streaming: returns a TextCompletionResult with text and token counts.
|
||||
Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
|
||||
"""
|
||||
request = {
|
||||
"query": query,
|
||||
"user": user,
|
||||
|
|
@ -598,7 +617,12 @@ class SocketFlowInstance:
|
|||
if streaming:
|
||||
return self._rag_generator(result)
|
||||
else:
|
||||
return result.get("response", "")
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def graph_rag_explain(
|
||||
self,
|
||||
|
|
@ -642,8 +666,12 @@ class SocketFlowInstance:
|
|||
doc_limit: int = 10,
|
||||
streaming: bool = False,
|
||||
**kwargs: Any
|
||||
) -> Union[str, Iterator[str]]:
|
||||
"""Execute document-based RAG query with optional streaming."""
|
||||
) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
|
||||
"""Execute document-based RAG query with optional streaming.
|
||||
|
||||
Non-streaming: returns a TextCompletionResult with text and token counts.
|
||||
Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
|
||||
"""
|
||||
request = {
|
||||
"query": query,
|
||||
"user": user,
|
||||
|
|
@ -658,7 +686,12 @@ class SocketFlowInstance:
|
|||
if streaming:
|
||||
return self._rag_generator(result)
|
||||
else:
|
||||
return result.get("response", "")
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def document_rag_explain(
|
||||
self,
|
||||
|
|
@ -684,10 +717,10 @@ class SocketFlowInstance:
|
|||
streaming=True, include_provenance=True
|
||||
)
|
||||
|
||||
def _rag_generator(self, result: Iterator[StreamingChunk]) -> Iterator[RAGChunk]:
    """Yield RAGChunk objects from a streaming RAG / prompt result.

    Token usage (in_token / out_token / model) arrives on the final
    chunk of the stream.
    """
    for chunk in result:
        # Pass through structured chunks only; skip anything else.
        if isinstance(chunk, RAGChunk):
            yield chunk
|
||||
|
||||
def prompt(
|
||||
self,
|
||||
|
|
@ -695,8 +728,12 @@ class SocketFlowInstance:
|
|||
variables: Dict[str, str],
|
||||
streaming: bool = False,
|
||||
**kwargs: Any
|
||||
) -> Union[str, Iterator[str]]:
|
||||
"""Execute a prompt template with optional streaming."""
|
||||
) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
|
||||
"""Execute a prompt template with optional streaming.
|
||||
|
||||
Non-streaming: returns a TextCompletionResult with text and token counts.
|
||||
Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
|
||||
"""
|
||||
request = {
|
||||
"id": id,
|
||||
"variables": variables,
|
||||
|
|
@ -709,7 +746,12 @@ class SocketFlowInstance:
|
|||
if streaming:
|
||||
return self._rag_generator(result)
|
||||
else:
|
||||
return result.get("response", "")
|
||||
return TextCompletionResult(
|
||||
text=result.get("text", result.get("response", "")),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def graph_embeddings_query(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -189,6 +189,9 @@ class AgentAnswer(StreamingChunk):
|
|||
chunk_type: str = "final-answer"
|
||||
end_of_dialog: bool = False
|
||||
message_id: str = ""
|
||||
in_token: Optional[int] = None
|
||||
out_token: Optional[int] = None
|
||||
model: Optional[str] = None
|
||||
|
||||
@dataclasses.dataclass
|
||||
class RAGChunk(StreamingChunk):
|
||||
|
|
@ -202,11 +205,37 @@ class RAGChunk(StreamingChunk):
|
|||
content: Generated text content
|
||||
end_of_stream: True if this is the final chunk of the stream
|
||||
error: Optional error information if an error occurred
|
||||
in_token: Input token count (populated on the final chunk, 0 otherwise)
|
||||
out_token: Output token count (populated on the final chunk, 0 otherwise)
|
||||
model: Model identifier (populated on the final chunk, empty otherwise)
|
||||
chunk_type: Always "rag"
|
||||
"""
|
||||
chunk_type: str = "rag"
|
||||
end_of_stream: bool = False
|
||||
error: Optional[Dict[str, str]] = None
|
||||
in_token: Optional[int] = None
|
||||
out_token: Optional[int] = None
|
||||
model: Optional[str] = None
|
||||
|
||||
@dataclasses.dataclass
class TextCompletionResult:
    """
    Result of a text completion request.

    Non-streaming calls populate ``text`` with the complete response;
    streaming calls leave it as None, since content is delivered
    through the chunk iterator instead.

    Attributes:
        text: Complete response text, or None in streaming mode.
        in_token: Input token count, or None when not available.
        out_token: Output token count, or None when not available.
        model: Identifier of the model used, or None when not available.
    """
    text: Optional[str]
    in_token: Optional[int] = None
    out_token: Optional[int] = None
    model: Optional[str] = None
|
||||
|
||||
@dataclasses.dataclass
|
||||
class ProvenanceEvent:
|
||||
|
|
|
|||
|
|
@ -18,8 +18,10 @@ from . librarian_client import LibrarianClient
|
|||
from . chunking_service import ChunkingService
|
||||
from . embeddings_service import EmbeddingsService
|
||||
from . embeddings_client import EmbeddingsClientSpec
|
||||
from . text_completion_client import TextCompletionClientSpec
|
||||
from . prompt_client import PromptClientSpec
|
||||
from . text_completion_client import (
|
||||
TextCompletionClientSpec, TextCompletionClient, TextCompletionResult,
|
||||
)
|
||||
from . prompt_client import PromptClientSpec, PromptClient, PromptResult
|
||||
from . triples_store_service import TriplesStoreService
|
||||
from . graph_embeddings_store_service import GraphEmbeddingsStoreService
|
||||
from . document_embeddings_store_service import DocumentEmbeddingsStoreService
|
||||
|
|
|
|||
|
|
@ -1,10 +1,22 @@
|
|||
|
||||
import json
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Any
|
||||
|
||||
from . request_response_spec import RequestResponse, RequestResponseSpec
|
||||
from .. schema import PromptRequest, PromptResponse
|
||||
|
||||
@dataclass
class PromptResult:
    """
    Result of a prompt-template invocation.

    Exactly one payload field is populated according to
    ``response_type``: ``text`` for "text", ``object`` for "json",
    ``objects`` for "jsonl". Token fields are None when the service
    did not report usage.
    """
    response_type: str              # "text", "json", or "jsonl"
    text: Optional[str] = None      # payload when response_type == "text"
    object: Any = None              # payload when response_type == "json"
    objects: Optional[list] = None  # payload when response_type == "jsonl"
    in_token: Optional[int] = None  # input token count, if reported
    out_token: Optional[int] = None # output token count, if reported
    model: Optional[str] = None     # model identifier, if reported
|
||||
|
||||
class PromptClient(RequestResponse):
|
||||
|
||||
async def prompt(self, id, variables, timeout=600, streaming=False, chunk_callback=None):
|
||||
|
|
@ -26,17 +38,40 @@ class PromptClient(RequestResponse):
|
|||
if resp.error:
|
||||
raise RuntimeError(resp.error.message)
|
||||
|
||||
if resp.text: return resp.text
|
||||
if resp.text:
|
||||
return PromptResult(
|
||||
response_type="text",
|
||||
text=resp.text,
|
||||
in_token=resp.in_token,
|
||||
out_token=resp.out_token,
|
||||
model=resp.model,
|
||||
)
|
||||
|
||||
return json.loads(resp.object)
|
||||
parsed = json.loads(resp.object)
|
||||
|
||||
if isinstance(parsed, list):
|
||||
return PromptResult(
|
||||
response_type="jsonl",
|
||||
objects=parsed,
|
||||
in_token=resp.in_token,
|
||||
out_token=resp.out_token,
|
||||
model=resp.model,
|
||||
)
|
||||
|
||||
return PromptResult(
|
||||
response_type="json",
|
||||
object=parsed,
|
||||
in_token=resp.in_token,
|
||||
out_token=resp.out_token,
|
||||
model=resp.model,
|
||||
)
|
||||
|
||||
else:
|
||||
|
||||
last_text = ""
|
||||
last_object = None
|
||||
last_resp = None
|
||||
|
||||
async def forward_chunks(resp):
|
||||
nonlocal last_text, last_object
|
||||
nonlocal last_resp
|
||||
|
||||
if resp.error:
|
||||
raise RuntimeError(resp.error.message)
|
||||
|
|
@ -44,14 +79,13 @@ class PromptClient(RequestResponse):
|
|||
end_stream = getattr(resp, 'end_of_stream', False)
|
||||
|
||||
if resp.text is not None:
|
||||
last_text = resp.text
|
||||
if chunk_callback:
|
||||
if asyncio.iscoroutinefunction(chunk_callback):
|
||||
await chunk_callback(resp.text, end_stream)
|
||||
else:
|
||||
chunk_callback(resp.text, end_stream)
|
||||
elif resp.object:
|
||||
last_object = resp.object
|
||||
|
||||
last_resp = resp
|
||||
|
||||
return end_stream
|
||||
|
||||
|
|
@ -70,10 +104,36 @@ class PromptClient(RequestResponse):
|
|||
timeout=timeout
|
||||
)
|
||||
|
||||
if last_text:
|
||||
return last_text
|
||||
if last_resp is None:
|
||||
return PromptResult(response_type="text")
|
||||
|
||||
return json.loads(last_object) if last_object else None
|
||||
if last_resp.object:
|
||||
parsed = json.loads(last_resp.object)
|
||||
|
||||
if isinstance(parsed, list):
|
||||
return PromptResult(
|
||||
response_type="jsonl",
|
||||
objects=parsed,
|
||||
in_token=last_resp.in_token,
|
||||
out_token=last_resp.out_token,
|
||||
model=last_resp.model,
|
||||
)
|
||||
|
||||
return PromptResult(
|
||||
response_type="json",
|
||||
object=parsed,
|
||||
in_token=last_resp.in_token,
|
||||
out_token=last_resp.out_token,
|
||||
model=last_resp.model,
|
||||
)
|
||||
|
||||
return PromptResult(
|
||||
response_type="text",
|
||||
text=last_resp.text,
|
||||
in_token=last_resp.in_token,
|
||||
out_token=last_resp.out_token,
|
||||
model=last_resp.model,
|
||||
)
|
||||
|
||||
async def extract_definitions(self, text, timeout=600):
|
||||
return await self.prompt(
|
||||
|
|
@ -152,4 +212,3 @@ class PromptClientSpec(RequestResponseSpec):
|
|||
response_schema = PromptResponse,
|
||||
impl = PromptClient,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,47 +1,71 @@
|
|||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
from . request_response_spec import RequestResponse, RequestResponseSpec
|
||||
from .. schema import TextCompletionRequest, TextCompletionResponse
|
||||
|
||||
@dataclass
class TextCompletionResult:
    """
    Outcome of a text completion call.

    ``text`` holds the full response for non-streaming calls and is
    None for streaming calls (content arrives via the chunk handler).
    Token counts and model are None when the service did not report
    them — distinct from a real zero count.
    """
    text: Optional[str]
    in_token: Optional[int] = None
    out_token: Optional[int] = None
    model: Optional[str] = None
|
||||
|
||||
class TextCompletionClient(RequestResponse):

    async def text_completion(self, system, prompt, timeout=600):
        """
        Non-streaming text completion.

        Args:
            system: System prompt for the LLM.
            prompt: User prompt.
            timeout: Request timeout in seconds.

        Returns:
            TextCompletionResult with the complete response text plus
            in_token / out_token / model (None when not reported).

        Raises:
            RuntimeError: if the service response carries an error.
        """
        resp = await self.request(
            TextCompletionRequest(
                system = system, prompt = prompt, streaming = False
            ),
            timeout=timeout
        )

        if resp.error:
            raise RuntimeError(resp.error.message)

        return TextCompletionResult(
            text = resp.response,
            in_token = resp.in_token,
            out_token = resp.out_token,
            model = resp.model,
        )
|
||||
|
||||
async def text_completion_stream(
    self, system, prompt, handler, timeout=600,
):
    """
    Streaming text completion.

    `handler` is an async callable invoked once per chunk with the
    chunk's TextCompletionResponse. Returns a TextCompletionResult
    with text=None; token counts and model are taken from the
    end_of_stream message.
    """

    async def on_chunk(resp):
        if resp.error:
            raise RuntimeError(resp.error.message)

        # Deliver the raw chunk to the caller's handler.
        await handler(resp)

        # Signal completion to the request loop on the final message.
        return getattr(resp, "end_of_stream", False)

    final = await self.request(
        TextCompletionRequest(
            system = system, prompt = prompt, streaming = True
        ),
        recipient=on_chunk,
        timeout=timeout,
    )

    # Streaming mode: text is delivered chunk-by-chunk, so only the
    # usage fields from the terminating message are returned here.
    return TextCompletionResult(
        text = None,
        in_token = final.in_token,
        out_token = final.out_token,
        model = final.model,
    )
|
||||
|
||||
class TextCompletionClientSpec(RequestResponseSpec):
|
||||
def __init__(
|
||||
|
|
@ -54,4 +78,3 @@ class TextCompletionClientSpec(RequestResponseSpec):
|
|||
response_schema = TextCompletionResponse,
|
||||
impl = TextCompletionClient,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -90,6 +90,13 @@ class AgentResponseTranslator(MessageTranslator):
|
|||
if hasattr(obj, 'error') and obj.error and obj.error.message:
|
||||
result["error"] = {"message": obj.error.message, "code": obj.error.code}
|
||||
|
||||
if obj.in_token is not None:
|
||||
result["in_token"] = obj.in_token
|
||||
if obj.out_token is not None:
|
||||
result["out_token"] = obj.out_token
|
||||
if obj.model is not None:
|
||||
result["model"] = obj.model
|
||||
|
||||
return result
|
||||
|
||||
def encode_with_completion(self, obj: AgentResponse) -> Tuple[Dict[str, Any], bool]:
|
||||
|
|
|
|||
|
|
@ -53,6 +53,13 @@ class PromptResponseTranslator(MessageTranslator):
|
|||
# Always include end_of_stream flag for streaming support
|
||||
result["end_of_stream"] = getattr(obj, "end_of_stream", False)
|
||||
|
||||
if obj.in_token is not None:
|
||||
result["in_token"] = obj.in_token
|
||||
if obj.out_token is not None:
|
||||
result["out_token"] = obj.out_token
|
||||
if obj.model is not None:
|
||||
result["model"] = obj.model
|
||||
|
||||
return result
|
||||
|
||||
def encode_with_completion(self, obj: PromptResponse) -> Tuple[Dict[str, Any], bool]:
|
||||
|
|
|
|||
|
|
@ -74,6 +74,13 @@ class DocumentRagResponseTranslator(MessageTranslator):
|
|||
if hasattr(obj, 'error') and obj.error and obj.error.message:
|
||||
result["error"] = {"message": obj.error.message, "type": obj.error.type}
|
||||
|
||||
if obj.in_token is not None:
|
||||
result["in_token"] = obj.in_token
|
||||
if obj.out_token is not None:
|
||||
result["out_token"] = obj.out_token
|
||||
if obj.model is not None:
|
||||
result["model"] = obj.model
|
||||
|
||||
return result
|
||||
|
||||
def encode_with_completion(self, obj: DocumentRagResponse) -> Tuple[Dict[str, Any], bool]:
|
||||
|
|
@ -163,6 +170,13 @@ class GraphRagResponseTranslator(MessageTranslator):
|
|||
if hasattr(obj, 'error') and obj.error and obj.error.message:
|
||||
result["error"] = {"message": obj.error.message, "type": obj.error.type}
|
||||
|
||||
if obj.in_token is not None:
|
||||
result["in_token"] = obj.in_token
|
||||
if obj.out_token is not None:
|
||||
result["out_token"] = obj.out_token
|
||||
if obj.model is not None:
|
||||
result["model"] = obj.model
|
||||
|
||||
return result
|
||||
|
||||
def encode_with_completion(self, obj: GraphRagResponse) -> Tuple[Dict[str, Any], bool]:
|
||||
|
|
|
|||
|
|
@ -29,11 +29,11 @@ class TextCompletionResponseTranslator(MessageTranslator):
|
|||
def encode(self, obj: TextCompletionResponse) -> Dict[str, Any]:
|
||||
result = {"response": obj.response}
|
||||
|
||||
if obj.in_token:
|
||||
if obj.in_token is not None:
|
||||
result["in_token"] = obj.in_token
|
||||
if obj.out_token:
|
||||
if obj.out_token is not None:
|
||||
result["out_token"] = obj.out_token
|
||||
if obj.model:
|
||||
if obj.model is not None:
|
||||
result["model"] = obj.model
|
||||
|
||||
# Always include end_of_stream flag for streaming support
|
||||
|
|
|
|||
|
|
@ -66,5 +66,10 @@ class AgentResponse:
|
|||
|
||||
error: Error | None = None
|
||||
|
||||
# Token usage (populated on end_of_dialog message)
|
||||
in_token: int | None = None
|
||||
out_token: int | None = None
|
||||
model: str | None = None
|
||||
|
||||
############################################################################
|
||||
|
||||
|
|
|
|||
|
|
@ -17,9 +17,9 @@ class TextCompletionRequest:
|
|||
class TextCompletionResponse:
|
||||
error: Error | None = None
|
||||
response: str = ""
|
||||
in_token: int = 0
|
||||
out_token: int = 0
|
||||
model: str = ""
|
||||
in_token: int | None = None
|
||||
out_token: int | None = None
|
||||
model: str | None = None
|
||||
end_of_stream: bool = False # Indicates final message in stream
|
||||
|
||||
############################################################################
|
||||
|
|
|
|||
|
|
@ -41,4 +41,9 @@ class PromptResponse:
|
|||
# Indicates final message in stream
|
||||
end_of_stream: bool = False
|
||||
|
||||
# Token usage from the underlying text completion
|
||||
in_token: int | None = None
|
||||
out_token: int | None = None
|
||||
model: str | None = None
|
||||
|
||||
############################################################################
|
||||
|
|
@ -29,6 +29,9 @@ class GraphRagResponse:
|
|||
explain_triples: list[Triple] = field(default_factory=list) # Provenance triples for this step
|
||||
message_type: str = "" # "chunk" or "explain"
|
||||
end_of_session: bool = False # Entire session complete
|
||||
in_token: int | None = None
|
||||
out_token: int | None = None
|
||||
model: str | None = None
|
||||
|
||||
############################################################################
|
||||
|
||||
|
|
@ -52,3 +55,6 @@ class DocumentRagResponse:
|
|||
explain_triples: list[Triple] = field(default_factory=list) # Provenance triples for this step
|
||||
message_type: str = "" # "chunk" or "explain"
|
||||
end_of_session: bool = False # Entire session complete
|
||||
in_token: int | None = None
|
||||
out_token: int | None = None
|
||||
model: str | None = None
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue