Expose LLM token usage across all service layers (#782)

Expose LLM token usage (in_token, out_token, model) across all service layers.

Propagate token counts from LLM services through the prompt, text-completion, graph-RAG, document-RAG, and agent orchestrator pipelines to the API gateway and Python SDK. All fields are Optional — None means "not available", distinguishing it from a real zero count.

Key changes:

- Schema: Add in_token/out_token/model to TextCompletionResponse, PromptResponse, GraphRagResponse, DocumentRagResponse, AgentResponse.
- TextCompletionClient: New TextCompletionResult return type. Split into text_completion() (non-streaming) and text_completion_stream() (streaming with per-chunk handler callback).
- PromptClient: New PromptResult with response_type (text/json/jsonl), typed fields (text/object/objects), and token usage. All callers updated.
- RAG services: Accumulate token usage across all prompt calls (extract-concepts, edge-scoring, edge-reasoning, synthesis). The non-streaming path sends a single combined response instead of chunk + end_of_session.
- Agent orchestrator: UsageTracker accumulates tokens across meta-router, pattern prompt calls, and react reasoning. The totals are attached to end_of_dialog.
- Translators: Encode token fields whenever they are not None (an `is not None` check, not a truthiness check, so a real 0 is still encoded).
- Python SDK: RAG and text-completion methods return TextCompletionResult (non-streaming) or RAGChunk/AgentAnswer with token fields (streaming).
- CLI: --show-usage flag on tg-invoke-llm, tg-invoke-prompt, tg-invoke-graph-rag, tg-invoke-document-rag, tg-invoke-agent.
2026-07-10 22:02:12 +02:00 · 2026-04-13 14:38:34 +01:00 · 2026-04-13 14:38:34 +01:00 · 14e49d83c7
commit 14e49d83c7
parent 67cfa80836
60 changed files with 1252 additions and 577 deletions
--- a/trustgraph-base/trustgraph/api/__init__.py
+++ b/trustgraph-base/trustgraph/api/__init__.py
@ -107,6 +107,7 @@ from .types import (
    AgentObservation,
    AgentAnswer,
    RAGChunk,
+    TextCompletionResult,
    ProvenanceEvent,
 )

@ -185,6 +186,7 @@ __all__ = [
    "AgentObservation",
    "AgentAnswer",
    "RAGChunk",
+    "TextCompletionResult",
    "ProvenanceEvent",

    # Exceptions
--- a/trustgraph-base/trustgraph/api/async_flow.py
+++ b/trustgraph-base/trustgraph/api/async_flow.py
@ -14,6 +14,8 @@ import aiohttp
 import json
 from typing import Optional, Dict, Any, List

+from . types import TextCompletionResult
+
 from . exceptions import ProtocolException, ApplicationException


@ -434,12 +436,11 @@ class AsyncFlowInstance:

        return await self.request("agent", request_data)

-    async def text_completion(self, system: str, prompt: str, **kwargs: Any) -> str:
+    async def text_completion(self, system: str, prompt: str, **kwargs: Any) -> TextCompletionResult:
        """
        Generate text completion (non-streaming).

        Generates a text response from an LLM given a system prompt and user prompt.
-        Returns the complete response text.

        Note: This method does not support streaming. For streaming text generation,
        use AsyncSocketFlowInstance.text_completion() instead.
@ -450,19 +451,19 @@ class AsyncFlowInstance:
            **kwargs: Additional service-specific parameters

        Returns:
-            str: Complete generated text response
+            TextCompletionResult: Result with text, in_token, out_token, model

        Example:
            ```python
            async_flow = await api.async_flow()
            flow = async_flow.id("default")

-            # Generate text
-            response = await flow.text_completion(
+            result = await flow.text_completion(
                system="You are a helpful assistant.",
                prompt="Explain quantum computing in simple terms."
            )
-            print(response)
+            print(result.text)
+            print(f"Tokens: {result.in_token} in, {result.out_token} out")
            ```
        """
        request_data = {
@ -473,7 +474,12 @@ class AsyncFlowInstance:
        request_data.update(kwargs)

        result = await self.request("text-completion", request_data)
-        return result.get("response", "")
+        return TextCompletionResult(
+            text=result.get("response", ""),
+            in_token=result.get("in_token"),
+            out_token=result.get("out_token"),
+            model=result.get("model"),
+        )

    async def graph_rag(self, query: str, user: str, collection: str,
                        max_subgraph_size: int = 1000, max_subgraph_count: int = 5,
--- a/trustgraph-base/trustgraph/api/async_socket_client.py
+++ b/trustgraph-base/trustgraph/api/async_socket_client.py
@ -4,7 +4,7 @@ import asyncio
 import websockets
 from typing import Optional, Dict, Any, AsyncIterator, Union

-from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk
+from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk, TextCompletionResult
 from . exceptions import ProtocolException, ApplicationException


@ -199,7 +199,10 @@ class AsyncSocketClient:
            return AgentAnswer(
                content=resp.get("content", ""),
                end_of_message=resp.get("end_of_message", False),
-                end_of_dialog=resp.get("end_of_dialog", False)
+                end_of_dialog=resp.get("end_of_dialog", False),
+                in_token=resp.get("in_token"),
+                out_token=resp.get("out_token"),
+                model=resp.get("model"),
            )
        elif chunk_type == "action":
            return AgentThought(
@ -211,7 +214,10 @@ class AsyncSocketClient:
            return RAGChunk(
                content=content,
                end_of_stream=resp.get("end_of_stream", False),
-                error=None
+                error=None,
+                in_token=resp.get("in_token"),
+                out_token=resp.get("out_token"),
+                model=resp.get("model"),
            )

    async def aclose(self):
@ -269,7 +275,11 @@ class AsyncSocketFlowInstance:
            return await self.client._send_request("agent", self.flow_id, request)

    async def text_completion(self, system: str, prompt: str, streaming: bool = False, **kwargs):
-        """Text completion with optional streaming"""
+        """Text completion with optional streaming.
+
+        Non-streaming: returns a TextCompletionResult with text and token counts.
+        Streaming: returns an async iterator of RAGChunk (with token counts on the final chunk).
+        """
        request = {
            "system": system,
            "prompt": prompt,
@ -281,13 +291,18 @@ class AsyncSocketFlowInstance:
            return self._text_completion_streaming(request)
        else:
            result = await self.client._send_request("text-completion", self.flow_id, request)
-            return result.get("response", "")
+            return TextCompletionResult(
+                text=result.get("response", ""),
+                in_token=result.get("in_token"),
+                out_token=result.get("out_token"),
+                model=result.get("model"),
+            )

    async def _text_completion_streaming(self, request):
-        """Helper for streaming text completion"""
+        """Helper for streaming text completion. Yields RAGChunk objects."""
        async for chunk in self.client._send_request_streaming("text-completion", self.flow_id, request):
-            if hasattr(chunk, 'content'):
-                yield chunk.content
+            if isinstance(chunk, RAGChunk):
+                yield chunk

    async def graph_rag(self, query: str, user: str, collection: str,
                        max_subgraph_size: int = 1000, max_subgraph_count: int = 5,
--- a/trustgraph-base/trustgraph/api/flow.py
+++ b/trustgraph-base/trustgraph/api/flow.py
@ -11,7 +11,7 @@ import base64

 from .. knowledge import hash, Uri, Literal, QuotedTriple
 from .. schema import IRI, LITERAL, TRIPLE
-from . types import Triple
+from . types import Triple, TextCompletionResult
 from . exceptions import ProtocolException


@ -360,16 +360,17 @@ class FlowInstance:
            prompt: User prompt/question

        Returns:
-            str: Generated response text
+            TextCompletionResult: Result with text, in_token, out_token, model

        Example:
            ```python
            flow = api.flow().id("default")
-            response = flow.text_completion(
+            result = flow.text_completion(
                system="You are a helpful assistant",
                prompt="What is quantum computing?"
            )
-            print(response)
+            print(result.text)
+            print(f"Tokens: {result.in_token} in, {result.out_token} out")
            ```
        """

@ -379,10 +380,17 @@ class FlowInstance:
            "prompt": prompt
        }

-        return self.request(
+        result = self.request(
            "service/text-completion",
            input
-        )["response"]
+        )
+
+        return TextCompletionResult(
+            text=result.get("response", ""),
+            in_token=result.get("in_token"),
+            out_token=result.get("out_token"),
+            model=result.get("model"),
+        )

    def agent(self, question, user="trustgraph", state=None, group=None, history=None):
        """
@ -498,10 +506,17 @@ class FlowInstance:
            "edge-limit": edge_limit,
        }

-        return self.request(
+        result = self.request(
            "service/graph-rag",
            input
-        )["response"]
+        )
+
+        return TextCompletionResult(
+            text=result.get("response", ""),
+            in_token=result.get("in_token"),
+            out_token=result.get("out_token"),
+            model=result.get("model"),
+        )

    def document_rag(
            self, query, user="trustgraph", collection="default",
@ -543,10 +558,17 @@ class FlowInstance:
            "doc-limit": doc_limit,
        }

-        return self.request(
+        result = self.request(
            "service/document-rag",
            input
-        )["response"]
+        )
+
+        return TextCompletionResult(
+            text=result.get("response", ""),
+            in_token=result.get("in_token"),
+            out_token=result.get("out_token"),
+            model=result.get("model"),
+        )

    def embeddings(self, texts):
        """
--- a/trustgraph-base/trustgraph/api/socket_client.py
+++ b/trustgraph-base/trustgraph/api/socket_client.py
@ -14,7 +14,7 @@ import websockets
 from typing import Optional, Dict, Any, Iterator, Union, List
 from threading import Lock

-from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk, StreamingChunk, ProvenanceEvent
+from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk, StreamingChunk, ProvenanceEvent, TextCompletionResult
 from . exceptions import ProtocolException, raise_from_error_dict


@ -393,6 +393,9 @@ class SocketClient:
                end_of_message=resp.get("end_of_message", False),
                end_of_dialog=resp.get("end_of_dialog", False),
                message_id=resp.get("message_id", ""),
+                in_token=resp.get("in_token"),
+                out_token=resp.get("out_token"),
+                model=resp.get("model"),
            )
        elif chunk_type == "action":
            return AgentThought(
@ -404,7 +407,10 @@ class SocketClient:
            return RAGChunk(
                content=content,
                end_of_stream=resp.get("end_of_stream", False),
-                error=None
+                error=None,
+                in_token=resp.get("in_token"),
+                out_token=resp.get("out_token"),
+                model=resp.get("model"),
            )

    def _build_provenance_event(self, resp: Dict[str, Any]) -> ProvenanceEvent:
@ -543,8 +549,12 @@ class SocketFlowInstance:
            streaming=True, include_provenance=True
        )

-    def text_completion(self, system: str, prompt: str, streaming: bool = False, **kwargs) -> Union[str, Iterator[str]]:
-        """Execute text completion with optional streaming."""
+    def text_completion(self, system: str, prompt: str, streaming: bool = False, **kwargs) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
+        """Execute text completion with optional streaming.
+
+        Non-streaming: returns a TextCompletionResult with text and token counts.
+        Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
+        """
        request = {
            "system": system,
            "prompt": prompt,
@ -557,12 +567,17 @@ class SocketFlowInstance:
        if streaming:
            return self._text_completion_generator(result)
        else:
-            return result.get("response", "")
+            return TextCompletionResult(
+                text=result.get("response", ""),
+                in_token=result.get("in_token"),
+                out_token=result.get("out_token"),
+                model=result.get("model"),
+            )

-    def _text_completion_generator(self, result: Iterator[StreamingChunk]) -> Iterator[str]:
+    def _text_completion_generator(self, result: Iterator[StreamingChunk]) -> Iterator[RAGChunk]:
        for chunk in result:
-            if hasattr(chunk, 'content'):
-                yield chunk.content
+            if isinstance(chunk, RAGChunk):
+                yield chunk

    def graph_rag(
        self,
@ -577,8 +592,12 @@ class SocketFlowInstance:
        edge_limit: int = 25,
        streaming: bool = False,
        **kwargs: Any
-    ) -> Union[str, Iterator[str]]:
-        """Execute graph-based RAG query with optional streaming."""
+    ) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
+        """Execute graph-based RAG query with optional streaming.
+
+        Non-streaming: returns a TextCompletionResult with text and token counts.
+        Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
+        """
        request = {
            "query": query,
            "user": user,
@ -598,7 +617,12 @@ class SocketFlowInstance:
        if streaming:
            return self._rag_generator(result)
        else:
-            return result.get("response", "")
+            return TextCompletionResult(
+                text=result.get("response", ""),
+                in_token=result.get("in_token"),
+                out_token=result.get("out_token"),
+                model=result.get("model"),
+            )

    def graph_rag_explain(
        self,
@ -642,8 +666,12 @@ class SocketFlowInstance:
        doc_limit: int = 10,
        streaming: bool = False,
        **kwargs: Any
-    ) -> Union[str, Iterator[str]]:
-        """Execute document-based RAG query with optional streaming."""
+    ) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
+        """Execute document-based RAG query with optional streaming.
+
+        Non-streaming: returns a TextCompletionResult with text and token counts.
+        Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
+        """
        request = {
            "query": query,
            "user": user,
@ -658,7 +686,12 @@ class SocketFlowInstance:
        if streaming:
            return self._rag_generator(result)
        else:
-            return result.get("response", "")
+            return TextCompletionResult(
+                text=result.get("response", ""),
+                in_token=result.get("in_token"),
+                out_token=result.get("out_token"),
+                model=result.get("model"),
+            )

    def document_rag_explain(
        self,
@ -684,10 +717,10 @@ class SocketFlowInstance:
            streaming=True, include_provenance=True
        )

-    def _rag_generator(self, result: Iterator[StreamingChunk]) -> Iterator[str]:
+    def _rag_generator(self, result: Iterator[StreamingChunk]) -> Iterator[RAGChunk]:
        for chunk in result:
-            if hasattr(chunk, 'content'):
-                yield chunk.content
+            if isinstance(chunk, RAGChunk):
+                yield chunk

    def prompt(
        self,
@ -695,8 +728,12 @@ class SocketFlowInstance:
        variables: Dict[str, str],
        streaming: bool = False,
        **kwargs: Any
-    ) -> Union[str, Iterator[str]]:
-        """Execute a prompt template with optional streaming."""
+    ) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
+        """Execute a prompt template with optional streaming.
+
+        Non-streaming: returns a TextCompletionResult with text and token counts.
+        Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
+        """
        request = {
            "id": id,
            "variables": variables,
@ -709,7 +746,12 @@ class SocketFlowInstance:
        if streaming:
            return self._rag_generator(result)
        else:
-            return result.get("response", "")
+            return TextCompletionResult(
+                text=result.get("text", result.get("response", "")),
+                in_token=result.get("in_token"),
+                out_token=result.get("out_token"),
+                model=result.get("model"),
+            )

    def graph_embeddings_query(
        self,
--- a/trustgraph-base/trustgraph/api/types.py
+++ b/trustgraph-base/trustgraph/api/types.py
@ -189,6 +189,9 @@ class AgentAnswer(StreamingChunk):
    chunk_type: str = "final-answer"
    end_of_dialog: bool = False
    message_id: str = ""
+    in_token: Optional[int] = None
+    out_token: Optional[int] = None
+    model: Optional[str] = None

@dataclasses.dataclass
 class RAGChunk(StreamingChunk):
@ -202,11 +205,37 @@ class RAGChunk(StreamingChunk):
        content: Generated text content
        end_of_stream: True if this is the final chunk of the stream
        error: Optional error information if an error occurred
+        in_token: Input token count (populated on the final chunk, None otherwise)
+        out_token: Output token count (populated on the final chunk, None otherwise)
+        model: Model identifier (populated on the final chunk, None otherwise)
        chunk_type: Always "rag"
    """
    chunk_type: str = "rag"
    end_of_stream: bool = False
    error: Optional[Dict[str, str]] = None
+    in_token: Optional[int] = None
+    out_token: Optional[int] = None
+    model: Optional[str] = None
+
+@dataclasses.dataclass
+class TextCompletionResult:
+    """
+    Result from a text completion request.
+
+    Returned by the non-streaming text_completion(), graph_rag(),
+    document_rag() and prompt() calls, where text contains the complete
+    response. Streaming calls do not return this type; they yield RAGChunk
+    objects, with the token counts carried on the final chunk.
+
+    Attributes:
+        text: Complete response text
+        in_token: Input token count (None if not available)
+        out_token: Output token count (None if not available)
+        model: Model identifier (None if not available)
+    """
+    text: Optional[str]
+    in_token: Optional[int] = None
+    out_token: Optional[int] = None
+    model: Optional[str] = None

@dataclasses.dataclass
 class ProvenanceEvent: