Expose LLM token usage across all service layers (#782)

Expose LLM token usage (in_token, out_token, model) across all
service layers

Propagate token counts from LLM services through the prompt,
text-completion, graph-RAG, document-RAG, and agent orchestrator
pipelines to the API gateway and Python SDK. All fields are Optional:
None means "not available", which is distinct from a real zero count.

Key changes:

- Schema: Add in_token/out_token/model to TextCompletionResponse,
  PromptResponse, GraphRagResponse, DocumentRagResponse,
  AgentResponse

- TextCompletionClient: New TextCompletionResult return type. Split
  into text_completion() (non-streaming) and
  text_completion_stream() (streaming with per-chunk handler
  callback)

- PromptClient: New PromptResult with response_type
  (text/json/jsonl), typed fields (text/object/objects), and token
  usage. All callers updated.

- RAG services: Accumulate token usage across all prompt calls
  (extract-concepts, edge-scoring, edge-reasoning,
  synthesis). Non-streaming path sends single combined response
  instead of chunk + end_of_session.

- Agent orchestrator: UsageTracker accumulates tokens across
  meta-router, pattern prompt calls, and react reasoning. Attached
  to end_of_dialog.

- Translators: Encode token fields whenever they are not None (tested
  with `is not None` rather than truthiness, so a real zero count is
  still encoded)

- Python SDK: RAG and text-completion methods return
  TextCompletionResult (non-streaming) or RAGChunk/AgentAnswer with
  token fields (streaming)

- CLI: --show-usage flag on tg-invoke-llm, tg-invoke-prompt,
  tg-invoke-graph-rag, tg-invoke-document-rag, tg-invoke-agent
This commit is contained in:
cybermaggedon 2026-04-13 14:38:34 +01:00 committed by GitHub
parent 67cfa80836
commit 14e49d83c7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
60 changed files with 1252 additions and 577 deletions

View file

@ -107,6 +107,7 @@ from .types import (
AgentObservation,
AgentAnswer,
RAGChunk,
TextCompletionResult,
ProvenanceEvent,
)
@ -185,6 +186,7 @@ __all__ = [
"AgentObservation",
"AgentAnswer",
"RAGChunk",
"TextCompletionResult",
"ProvenanceEvent",
# Exceptions

View file

@ -14,6 +14,8 @@ import aiohttp
import json
from typing import Optional, Dict, Any, List
from . types import TextCompletionResult
from . exceptions import ProtocolException, ApplicationException
@ -434,12 +436,11 @@ class AsyncFlowInstance:
return await self.request("agent", request_data)
async def text_completion(self, system: str, prompt: str, **kwargs: Any) -> str:
async def text_completion(self, system: str, prompt: str, **kwargs: Any) -> TextCompletionResult:
"""
Generate text completion (non-streaming).
Generates a text response from an LLM given a system prompt and user prompt.
Returns the complete response text.
Note: This method does not support streaming. For streaming text generation,
use AsyncSocketFlowInstance.text_completion() instead.
@ -450,19 +451,19 @@ class AsyncFlowInstance:
**kwargs: Additional service-specific parameters
Returns:
str: Complete generated text response
TextCompletionResult: Result with text, in_token, out_token, model
Example:
```python
async_flow = await api.async_flow()
flow = async_flow.id("default")
# Generate text
response = await flow.text_completion(
result = await flow.text_completion(
system="You are a helpful assistant.",
prompt="Explain quantum computing in simple terms."
)
print(response)
print(result.text)
print(f"Tokens: {result.in_token} in, {result.out_token} out")
```
"""
request_data = {
@ -473,7 +474,12 @@ class AsyncFlowInstance:
request_data.update(kwargs)
result = await self.request("text-completion", request_data)
return result.get("response", "")
return TextCompletionResult(
text=result.get("response", ""),
in_token=result.get("in_token"),
out_token=result.get("out_token"),
model=result.get("model"),
)
async def graph_rag(self, query: str, user: str, collection: str,
max_subgraph_size: int = 1000, max_subgraph_count: int = 5,

View file

@ -4,7 +4,7 @@ import asyncio
import websockets
from typing import Optional, Dict, Any, AsyncIterator, Union
from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk
from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk, TextCompletionResult
from . exceptions import ProtocolException, ApplicationException
@ -199,7 +199,10 @@ class AsyncSocketClient:
return AgentAnswer(
content=resp.get("content", ""),
end_of_message=resp.get("end_of_message", False),
end_of_dialog=resp.get("end_of_dialog", False)
end_of_dialog=resp.get("end_of_dialog", False),
in_token=resp.get("in_token"),
out_token=resp.get("out_token"),
model=resp.get("model"),
)
elif chunk_type == "action":
return AgentThought(
@ -211,7 +214,10 @@ class AsyncSocketClient:
return RAGChunk(
content=content,
end_of_stream=resp.get("end_of_stream", False),
error=None
error=None,
in_token=resp.get("in_token"),
out_token=resp.get("out_token"),
model=resp.get("model"),
)
async def aclose(self):
@ -269,7 +275,11 @@ class AsyncSocketFlowInstance:
return await self.client._send_request("agent", self.flow_id, request)
async def text_completion(self, system: str, prompt: str, streaming: bool = False, **kwargs):
"""Text completion with optional streaming"""
"""Text completion with optional streaming.
Non-streaming: returns a TextCompletionResult with text and token counts.
Streaming: returns an async iterator of RAGChunk (with token counts on the final chunk).
"""
request = {
"system": system,
"prompt": prompt,
@ -281,13 +291,18 @@ class AsyncSocketFlowInstance:
return self._text_completion_streaming(request)
else:
result = await self.client._send_request("text-completion", self.flow_id, request)
return result.get("response", "")
return TextCompletionResult(
text=result.get("response", ""),
in_token=result.get("in_token"),
out_token=result.get("out_token"),
model=result.get("model"),
)
async def _text_completion_streaming(self, request):
"""Helper for streaming text completion"""
"""Helper for streaming text completion. Yields RAGChunk objects."""
async for chunk in self.client._send_request_streaming("text-completion", self.flow_id, request):
if hasattr(chunk, 'content'):
yield chunk.content
if isinstance(chunk, RAGChunk):
yield chunk
async def graph_rag(self, query: str, user: str, collection: str,
max_subgraph_size: int = 1000, max_subgraph_count: int = 5,

View file

@ -11,7 +11,7 @@ import base64
from .. knowledge import hash, Uri, Literal, QuotedTriple
from .. schema import IRI, LITERAL, TRIPLE
from . types import Triple
from . types import Triple, TextCompletionResult
from . exceptions import ProtocolException
@ -360,16 +360,17 @@ class FlowInstance:
prompt: User prompt/question
Returns:
str: Generated response text
TextCompletionResult: Result with text, in_token, out_token, model
Example:
```python
flow = api.flow().id("default")
response = flow.text_completion(
result = flow.text_completion(
system="You are a helpful assistant",
prompt="What is quantum computing?"
)
print(response)
print(result.text)
print(f"Tokens: {result.in_token} in, {result.out_token} out")
```
"""
@ -379,10 +380,17 @@ class FlowInstance:
"prompt": prompt
}
return self.request(
result = self.request(
"service/text-completion",
input
)["response"]
)
return TextCompletionResult(
text=result.get("response", ""),
in_token=result.get("in_token"),
out_token=result.get("out_token"),
model=result.get("model"),
)
def agent(self, question, user="trustgraph", state=None, group=None, history=None):
"""
@ -498,10 +506,17 @@ class FlowInstance:
"edge-limit": edge_limit,
}
return self.request(
result = self.request(
"service/graph-rag",
input
)["response"]
)
return TextCompletionResult(
text=result.get("response", ""),
in_token=result.get("in_token"),
out_token=result.get("out_token"),
model=result.get("model"),
)
def document_rag(
self, query, user="trustgraph", collection="default",
@ -543,10 +558,17 @@ class FlowInstance:
"doc-limit": doc_limit,
}
return self.request(
result = self.request(
"service/document-rag",
input
)["response"]
)
return TextCompletionResult(
text=result.get("response", ""),
in_token=result.get("in_token"),
out_token=result.get("out_token"),
model=result.get("model"),
)
def embeddings(self, texts):
"""

View file

@ -14,7 +14,7 @@ import websockets
from typing import Optional, Dict, Any, Iterator, Union, List
from threading import Lock
from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk, StreamingChunk, ProvenanceEvent
from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk, StreamingChunk, ProvenanceEvent, TextCompletionResult
from . exceptions import ProtocolException, raise_from_error_dict
@ -393,6 +393,9 @@ class SocketClient:
end_of_message=resp.get("end_of_message", False),
end_of_dialog=resp.get("end_of_dialog", False),
message_id=resp.get("message_id", ""),
in_token=resp.get("in_token"),
out_token=resp.get("out_token"),
model=resp.get("model"),
)
elif chunk_type == "action":
return AgentThought(
@ -404,7 +407,10 @@ class SocketClient:
return RAGChunk(
content=content,
end_of_stream=resp.get("end_of_stream", False),
error=None
error=None,
in_token=resp.get("in_token"),
out_token=resp.get("out_token"),
model=resp.get("model"),
)
def _build_provenance_event(self, resp: Dict[str, Any]) -> ProvenanceEvent:
@ -543,8 +549,12 @@ class SocketFlowInstance:
streaming=True, include_provenance=True
)
def text_completion(self, system: str, prompt: str, streaming: bool = False, **kwargs) -> Union[str, Iterator[str]]:
"""Execute text completion with optional streaming."""
def text_completion(self, system: str, prompt: str, streaming: bool = False, **kwargs) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
"""Execute text completion with optional streaming.
Non-streaming: returns a TextCompletionResult with text and token counts.
Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
"""
request = {
"system": system,
"prompt": prompt,
@ -557,12 +567,17 @@ class SocketFlowInstance:
if streaming:
return self._text_completion_generator(result)
else:
return result.get("response", "")
return TextCompletionResult(
text=result.get("response", ""),
in_token=result.get("in_token"),
out_token=result.get("out_token"),
model=result.get("model"),
)
def _text_completion_generator(self, result: Iterator[StreamingChunk]) -> Iterator[str]:
def _text_completion_generator(self, result: Iterator[StreamingChunk]) -> Iterator[RAGChunk]:
for chunk in result:
if hasattr(chunk, 'content'):
yield chunk.content
if isinstance(chunk, RAGChunk):
yield chunk
def graph_rag(
self,
@ -577,8 +592,12 @@ class SocketFlowInstance:
edge_limit: int = 25,
streaming: bool = False,
**kwargs: Any
) -> Union[str, Iterator[str]]:
"""Execute graph-based RAG query with optional streaming."""
) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
"""Execute graph-based RAG query with optional streaming.
Non-streaming: returns a TextCompletionResult with text and token counts.
Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
"""
request = {
"query": query,
"user": user,
@ -598,7 +617,12 @@ class SocketFlowInstance:
if streaming:
return self._rag_generator(result)
else:
return result.get("response", "")
return TextCompletionResult(
text=result.get("response", ""),
in_token=result.get("in_token"),
out_token=result.get("out_token"),
model=result.get("model"),
)
def graph_rag_explain(
self,
@ -642,8 +666,12 @@ class SocketFlowInstance:
doc_limit: int = 10,
streaming: bool = False,
**kwargs: Any
) -> Union[str, Iterator[str]]:
"""Execute document-based RAG query with optional streaming."""
) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
"""Execute document-based RAG query with optional streaming.
Non-streaming: returns a TextCompletionResult with text and token counts.
Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
"""
request = {
"query": query,
"user": user,
@ -658,7 +686,12 @@ class SocketFlowInstance:
if streaming:
return self._rag_generator(result)
else:
return result.get("response", "")
return TextCompletionResult(
text=result.get("response", ""),
in_token=result.get("in_token"),
out_token=result.get("out_token"),
model=result.get("model"),
)
def document_rag_explain(
self,
@ -684,10 +717,10 @@ class SocketFlowInstance:
streaming=True, include_provenance=True
)
def _rag_generator(self, result: Iterator[StreamingChunk]) -> Iterator[str]:
def _rag_generator(self, result: Iterator[StreamingChunk]) -> Iterator[RAGChunk]:
for chunk in result:
if hasattr(chunk, 'content'):
yield chunk.content
if isinstance(chunk, RAGChunk):
yield chunk
def prompt(
self,
@ -695,8 +728,12 @@ class SocketFlowInstance:
variables: Dict[str, str],
streaming: bool = False,
**kwargs: Any
) -> Union[str, Iterator[str]]:
"""Execute a prompt template with optional streaming."""
) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
"""Execute a prompt template with optional streaming.
Non-streaming: returns a TextCompletionResult with text and token counts.
Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
"""
request = {
"id": id,
"variables": variables,
@ -709,7 +746,12 @@ class SocketFlowInstance:
if streaming:
return self._rag_generator(result)
else:
return result.get("response", "")
return TextCompletionResult(
text=result.get("text", result.get("response", "")),
in_token=result.get("in_token"),
out_token=result.get("out_token"),
model=result.get("model"),
)
def graph_embeddings_query(
self,

View file

@ -189,6 +189,9 @@ class AgentAnswer(StreamingChunk):
chunk_type: str = "final-answer"
end_of_dialog: bool = False
message_id: str = ""
in_token: Optional[int] = None
out_token: Optional[int] = None
model: Optional[str] = None
@dataclasses.dataclass
class RAGChunk(StreamingChunk):
@ -202,11 +205,37 @@ class RAGChunk(StreamingChunk):
content: Generated text content
end_of_stream: True if this is the final chunk of the stream
error: Optional error information if an error occurred
in_token: Input token count (populated on the final chunk, None otherwise)
out_token: Output token count (populated on the final chunk, None otherwise)
model: Model identifier (populated on the final chunk, None otherwise)
chunk_type: Always "rag"
"""
chunk_type: str = "rag"
end_of_stream: bool = False
error: Optional[Dict[str, str]] = None
in_token: Optional[int] = None
out_token: Optional[int] = None
model: Optional[str] = None
@dataclasses.dataclass
class TextCompletionResult:
    """
    Complete (non-streaming) result of an LLM-backed request, with token usage.

    The SDK's non-streaming text_completion(), graph_rag(), document_rag()
    and prompt() calls build one of these from the service response.
    Streaming calls do not return this type: they yield RAGChunk objects,
    which carry the same token fields.

    Attributes:
        text: Complete response text. Typed Optional; the SDK call sites
            fall back to "" when the response carries no text.
        in_token: Input token count (None if not available)
        out_token: Output token count (None if not available)
        model: Model identifier (None if not available)
    """
    # `text` has no default, so it must always be passed explicitly.
    text: Optional[str]
    # None means "not reported by the service"; 0 is a genuine zero count.
    in_token: Optional[int] = None
    out_token: Optional[int] = None
    model: Optional[str] = None
@dataclasses.dataclass
class ProvenanceEvent: