mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Expose LLM token usage across all service layers (#782)
Expose LLM token usage (in_token, out_token, model) across all service layers Propagate token counts from LLM services through the prompt, text-completion, graph-RAG, document-RAG, and agent orchestrator pipelines to the API gateway and Python SDK. All fields are Optional — None means "not available", distinguishing from a real zero count. Key changes: - Schema: Add in_token/out_token/model to TextCompletionResponse, PromptResponse, GraphRagResponse, DocumentRagResponse, AgentResponse - TextCompletionClient: New TextCompletionResult return type. Split into text_completion() (non-streaming) and text_completion_stream() (streaming with per-chunk handler callback) - PromptClient: New PromptResult with response_type (text/json/jsonl), typed fields (text/object/objects), and token usage. All callers updated. - RAG services: Accumulate token usage across all prompt calls (extract-concepts, edge-scoring, edge-reasoning, synthesis). Non-streaming path sends single combined response instead of chunk + end_of_session. - Agent orchestrator: UsageTracker accumulates tokens across meta-router, pattern prompt calls, and react reasoning. Attached to end_of_dialog. - Translators: Encode token fields when not None (is not None, not truthy) - Python SDK: RAG and text-completion methods return TextCompletionResult (non-streaming) or RAGChunk/AgentAnswer with token fields (streaming) - CLI: --show-usage flag on tg-invoke-llm, tg-invoke-prompt, tg-invoke-graph-rag, tg-invoke-document-rag, tg-invoke-agent
This commit is contained in:
parent
67cfa80836
commit
14e49d83c7
60 changed files with 1252 additions and 577 deletions
|
|
@ -107,6 +107,7 @@ from .types import (
|
|||
AgentObservation,
|
||||
AgentAnswer,
|
||||
RAGChunk,
|
||||
TextCompletionResult,
|
||||
ProvenanceEvent,
|
||||
)
|
||||
|
||||
|
|
@ -185,6 +186,7 @@ __all__ = [
|
|||
"AgentObservation",
|
||||
"AgentAnswer",
|
||||
"RAGChunk",
|
||||
"TextCompletionResult",
|
||||
"ProvenanceEvent",
|
||||
|
||||
# Exceptions
|
||||
|
|
|
|||
|
|
@ -14,6 +14,8 @@ import aiohttp
|
|||
import json
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
from . types import TextCompletionResult
|
||||
|
||||
from . exceptions import ProtocolException, ApplicationException
|
||||
|
||||
|
||||
|
|
@ -434,12 +436,11 @@ class AsyncFlowInstance:
|
|||
|
||||
return await self.request("agent", request_data)
|
||||
|
||||
async def text_completion(self, system: str, prompt: str, **kwargs: Any) -> str:
|
||||
async def text_completion(self, system: str, prompt: str, **kwargs: Any) -> TextCompletionResult:
|
||||
"""
|
||||
Generate text completion (non-streaming).
|
||||
|
||||
Generates a text response from an LLM given a system prompt and user prompt.
|
||||
Returns the complete response text.
|
||||
|
||||
Note: This method does not support streaming. For streaming text generation,
|
||||
use AsyncSocketFlowInstance.text_completion() instead.
|
||||
|
|
@ -450,19 +451,19 @@ class AsyncFlowInstance:
|
|||
**kwargs: Additional service-specific parameters
|
||||
|
||||
Returns:
|
||||
str: Complete generated text response
|
||||
TextCompletionResult: Result with text, in_token, out_token, model
|
||||
|
||||
Example:
|
||||
```python
|
||||
async_flow = await api.async_flow()
|
||||
flow = async_flow.id("default")
|
||||
|
||||
# Generate text
|
||||
response = await flow.text_completion(
|
||||
result = await flow.text_completion(
|
||||
system="You are a helpful assistant.",
|
||||
prompt="Explain quantum computing in simple terms."
|
||||
)
|
||||
print(response)
|
||||
print(result.text)
|
||||
print(f"Tokens: {result.in_token} in, {result.out_token} out")
|
||||
```
|
||||
"""
|
||||
request_data = {
|
||||
|
|
@ -473,7 +474,12 @@ class AsyncFlowInstance:
|
|||
request_data.update(kwargs)
|
||||
|
||||
result = await self.request("text-completion", request_data)
|
||||
return result.get("response", "")
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
async def graph_rag(self, query: str, user: str, collection: str,
|
||||
max_subgraph_size: int = 1000, max_subgraph_count: int = 5,
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import asyncio
|
|||
import websockets
|
||||
from typing import Optional, Dict, Any, AsyncIterator, Union
|
||||
|
||||
from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk
|
||||
from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk, TextCompletionResult
|
||||
from . exceptions import ProtocolException, ApplicationException
|
||||
|
||||
|
||||
|
|
@ -199,7 +199,10 @@ class AsyncSocketClient:
|
|||
return AgentAnswer(
|
||||
content=resp.get("content", ""),
|
||||
end_of_message=resp.get("end_of_message", False),
|
||||
end_of_dialog=resp.get("end_of_dialog", False)
|
||||
end_of_dialog=resp.get("end_of_dialog", False),
|
||||
in_token=resp.get("in_token"),
|
||||
out_token=resp.get("out_token"),
|
||||
model=resp.get("model"),
|
||||
)
|
||||
elif chunk_type == "action":
|
||||
return AgentThought(
|
||||
|
|
@ -211,7 +214,10 @@ class AsyncSocketClient:
|
|||
return RAGChunk(
|
||||
content=content,
|
||||
end_of_stream=resp.get("end_of_stream", False),
|
||||
error=None
|
||||
error=None,
|
||||
in_token=resp.get("in_token"),
|
||||
out_token=resp.get("out_token"),
|
||||
model=resp.get("model"),
|
||||
)
|
||||
|
||||
async def aclose(self):
|
||||
|
|
@ -269,7 +275,11 @@ class AsyncSocketFlowInstance:
|
|||
return await self.client._send_request("agent", self.flow_id, request)
|
||||
|
||||
async def text_completion(self, system: str, prompt: str, streaming: bool = False, **kwargs):
|
||||
"""Text completion with optional streaming"""
|
||||
"""Text completion with optional streaming.
|
||||
|
||||
Non-streaming: returns a TextCompletionResult with text and token counts.
|
||||
Streaming: returns an async iterator of RAGChunk (with token counts on the final chunk).
|
||||
"""
|
||||
request = {
|
||||
"system": system,
|
||||
"prompt": prompt,
|
||||
|
|
@ -281,13 +291,18 @@ class AsyncSocketFlowInstance:
|
|||
return self._text_completion_streaming(request)
|
||||
else:
|
||||
result = await self.client._send_request("text-completion", self.flow_id, request)
|
||||
return result.get("response", "")
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
async def _text_completion_streaming(self, request):
    """Helper for streaming text completion. Yields RAGChunk objects.

    Per the streaming contract, token usage (in_token / out_token /
    model) is carried on the final chunk.
    """
    async for chunk in self.client._send_request_streaming(
            "text-completion", self.flow_id, request):
        # Forward only structured RAGChunk objects; any other message
        # type in the stream is ignored.
        if isinstance(chunk, RAGChunk):
            yield chunk
|
||||
|
||||
async def graph_rag(self, query: str, user: str, collection: str,
|
||||
max_subgraph_size: int = 1000, max_subgraph_count: int = 5,
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ import base64
|
|||
|
||||
from .. knowledge import hash, Uri, Literal, QuotedTriple
|
||||
from .. schema import IRI, LITERAL, TRIPLE
|
||||
from . types import Triple
|
||||
from . types import Triple, TextCompletionResult
|
||||
from . exceptions import ProtocolException
|
||||
|
||||
|
||||
|
|
@ -360,16 +360,17 @@ class FlowInstance:
|
|||
prompt: User prompt/question
|
||||
|
||||
Returns:
|
||||
str: Generated response text
|
||||
TextCompletionResult: Result with text, in_token, out_token, model
|
||||
|
||||
Example:
|
||||
```python
|
||||
flow = api.flow().id("default")
|
||||
response = flow.text_completion(
|
||||
result = flow.text_completion(
|
||||
system="You are a helpful assistant",
|
||||
prompt="What is quantum computing?"
|
||||
)
|
||||
print(response)
|
||||
print(result.text)
|
||||
print(f"Tokens: {result.in_token} in, {result.out_token} out")
|
||||
```
|
||||
"""
|
||||
|
||||
|
|
@ -379,10 +380,17 @@ class FlowInstance:
|
|||
"prompt": prompt
|
||||
}
|
||||
|
||||
return self.request(
|
||||
result = self.request(
|
||||
"service/text-completion",
|
||||
input
|
||||
)["response"]
|
||||
)
|
||||
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def agent(self, question, user="trustgraph", state=None, group=None, history=None):
|
||||
"""
|
||||
|
|
@ -498,10 +506,17 @@ class FlowInstance:
|
|||
"edge-limit": edge_limit,
|
||||
}
|
||||
|
||||
return self.request(
|
||||
result = self.request(
|
||||
"service/graph-rag",
|
||||
input
|
||||
)["response"]
|
||||
)
|
||||
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def document_rag(
|
||||
self, query, user="trustgraph", collection="default",
|
||||
|
|
@ -543,10 +558,17 @@ class FlowInstance:
|
|||
"doc-limit": doc_limit,
|
||||
}
|
||||
|
||||
return self.request(
|
||||
result = self.request(
|
||||
"service/document-rag",
|
||||
input
|
||||
)["response"]
|
||||
)
|
||||
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def embeddings(self, texts):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ import websockets
|
|||
from typing import Optional, Dict, Any, Iterator, Union, List
|
||||
from threading import Lock
|
||||
|
||||
from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk, StreamingChunk, ProvenanceEvent
|
||||
from . types import AgentThought, AgentObservation, AgentAnswer, RAGChunk, StreamingChunk, ProvenanceEvent, TextCompletionResult
|
||||
from . exceptions import ProtocolException, raise_from_error_dict
|
||||
|
||||
|
||||
|
|
@ -393,6 +393,9 @@ class SocketClient:
|
|||
end_of_message=resp.get("end_of_message", False),
|
||||
end_of_dialog=resp.get("end_of_dialog", False),
|
||||
message_id=resp.get("message_id", ""),
|
||||
in_token=resp.get("in_token"),
|
||||
out_token=resp.get("out_token"),
|
||||
model=resp.get("model"),
|
||||
)
|
||||
elif chunk_type == "action":
|
||||
return AgentThought(
|
||||
|
|
@ -404,7 +407,10 @@ class SocketClient:
|
|||
return RAGChunk(
|
||||
content=content,
|
||||
end_of_stream=resp.get("end_of_stream", False),
|
||||
error=None
|
||||
error=None,
|
||||
in_token=resp.get("in_token"),
|
||||
out_token=resp.get("out_token"),
|
||||
model=resp.get("model"),
|
||||
)
|
||||
|
||||
def _build_provenance_event(self, resp: Dict[str, Any]) -> ProvenanceEvent:
|
||||
|
|
@ -543,8 +549,12 @@ class SocketFlowInstance:
|
|||
streaming=True, include_provenance=True
|
||||
)
|
||||
|
||||
def text_completion(self, system: str, prompt: str, streaming: bool = False, **kwargs) -> Union[str, Iterator[str]]:
|
||||
"""Execute text completion with optional streaming."""
|
||||
def text_completion(self, system: str, prompt: str, streaming: bool = False, **kwargs) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
|
||||
"""Execute text completion with optional streaming.
|
||||
|
||||
Non-streaming: returns a TextCompletionResult with text and token counts.
|
||||
Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
|
||||
"""
|
||||
request = {
|
||||
"system": system,
|
||||
"prompt": prompt,
|
||||
|
|
@ -557,12 +567,17 @@ class SocketFlowInstance:
|
|||
if streaming:
|
||||
return self._text_completion_generator(result)
|
||||
else:
|
||||
return result.get("response", "")
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def _text_completion_generator(self, result: Iterator[StreamingChunk]) -> Iterator[RAGChunk]:
    """Yield RAGChunk objects from a streaming text-completion result.

    Token usage (in_token / out_token / model) arrives on the final
    chunk of the stream.
    """
    for chunk in result:
        # Pass through structured chunks only; skip anything else.
        if isinstance(chunk, RAGChunk):
            yield chunk
|
||||
|
||||
def graph_rag(
|
||||
self,
|
||||
|
|
@ -577,8 +592,12 @@ class SocketFlowInstance:
|
|||
edge_limit: int = 25,
|
||||
streaming: bool = False,
|
||||
**kwargs: Any
|
||||
) -> Union[str, Iterator[str]]:
|
||||
"""Execute graph-based RAG query with optional streaming."""
|
||||
) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
|
||||
"""Execute graph-based RAG query with optional streaming.
|
||||
|
||||
Non-streaming: returns a TextCompletionResult with text and token counts.
|
||||
Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
|
||||
"""
|
||||
request = {
|
||||
"query": query,
|
||||
"user": user,
|
||||
|
|
@ -598,7 +617,12 @@ class SocketFlowInstance:
|
|||
if streaming:
|
||||
return self._rag_generator(result)
|
||||
else:
|
||||
return result.get("response", "")
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def graph_rag_explain(
|
||||
self,
|
||||
|
|
@ -642,8 +666,12 @@ class SocketFlowInstance:
|
|||
doc_limit: int = 10,
|
||||
streaming: bool = False,
|
||||
**kwargs: Any
|
||||
) -> Union[str, Iterator[str]]:
|
||||
"""Execute document-based RAG query with optional streaming."""
|
||||
) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
|
||||
"""Execute document-based RAG query with optional streaming.
|
||||
|
||||
Non-streaming: returns a TextCompletionResult with text and token counts.
|
||||
Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
|
||||
"""
|
||||
request = {
|
||||
"query": query,
|
||||
"user": user,
|
||||
|
|
@ -658,7 +686,12 @@ class SocketFlowInstance:
|
|||
if streaming:
|
||||
return self._rag_generator(result)
|
||||
else:
|
||||
return result.get("response", "")
|
||||
return TextCompletionResult(
|
||||
text=result.get("response", ""),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def document_rag_explain(
|
||||
self,
|
||||
|
|
@ -684,10 +717,10 @@ class SocketFlowInstance:
|
|||
streaming=True, include_provenance=True
|
||||
)
|
||||
|
||||
def _rag_generator(self, result: Iterator[StreamingChunk]) -> Iterator[RAGChunk]:
    """Yield RAGChunk objects from a streaming RAG / prompt result.

    Token usage (in_token / out_token / model) arrives on the final
    chunk of the stream.
    """
    for chunk in result:
        # Pass through structured chunks only; skip anything else.
        if isinstance(chunk, RAGChunk):
            yield chunk
|
||||
|
||||
def prompt(
|
||||
self,
|
||||
|
|
@ -695,8 +728,12 @@ class SocketFlowInstance:
|
|||
variables: Dict[str, str],
|
||||
streaming: bool = False,
|
||||
**kwargs: Any
|
||||
) -> Union[str, Iterator[str]]:
|
||||
"""Execute a prompt template with optional streaming."""
|
||||
) -> Union[TextCompletionResult, Iterator[RAGChunk]]:
|
||||
"""Execute a prompt template with optional streaming.
|
||||
|
||||
Non-streaming: returns a TextCompletionResult with text and token counts.
|
||||
Streaming: returns an iterator of RAGChunk (with token counts on the final chunk).
|
||||
"""
|
||||
request = {
|
||||
"id": id,
|
||||
"variables": variables,
|
||||
|
|
@ -709,7 +746,12 @@ class SocketFlowInstance:
|
|||
if streaming:
|
||||
return self._rag_generator(result)
|
||||
else:
|
||||
return result.get("response", "")
|
||||
return TextCompletionResult(
|
||||
text=result.get("text", result.get("response", "")),
|
||||
in_token=result.get("in_token"),
|
||||
out_token=result.get("out_token"),
|
||||
model=result.get("model"),
|
||||
)
|
||||
|
||||
def graph_embeddings_query(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -189,6 +189,9 @@ class AgentAnswer(StreamingChunk):
|
|||
chunk_type: str = "final-answer"
|
||||
end_of_dialog: bool = False
|
||||
message_id: str = ""
|
||||
in_token: Optional[int] = None
|
||||
out_token: Optional[int] = None
|
||||
model: Optional[str] = None
|
||||
|
||||
@dataclasses.dataclass
|
||||
class RAGChunk(StreamingChunk):
|
||||
|
|
@ -202,11 +205,37 @@ class RAGChunk(StreamingChunk):
|
|||
content: Generated text content
|
||||
end_of_stream: True if this is the final chunk of the stream
|
||||
error: Optional error information if an error occurred
|
||||
in_token: Input token count (populated on the final chunk, 0 otherwise)
|
||||
out_token: Output token count (populated on the final chunk, 0 otherwise)
|
||||
model: Model identifier (populated on the final chunk, empty otherwise)
|
||||
chunk_type: Always "rag"
|
||||
"""
|
||||
chunk_type: str = "rag"
|
||||
end_of_stream: bool = False
|
||||
error: Optional[Dict[str, str]] = None
|
||||
in_token: Optional[int] = None
|
||||
out_token: Optional[int] = None
|
||||
model: Optional[str] = None
|
||||
|
||||
@dataclasses.dataclass
class TextCompletionResult:
    """
    Result of a text completion request.

    Non-streaming calls populate ``text`` with the complete response;
    streaming calls leave it as None, since content is delivered
    through the chunk iterator instead.

    Attributes:
        text: Complete response text, or None in streaming mode.
        in_token: Input token count, or None when not available.
        out_token: Output token count, or None when not available.
        model: Identifier of the model used, or None when not available.
    """
    text: Optional[str]
    in_token: Optional[int] = None
    out_token: Optional[int] = None
    model: Optional[str] = None
|
||||
|
||||
@dataclasses.dataclass
|
||||
class ProvenanceEvent:
|
||||
|
|
|
|||
|
|
@ -18,8 +18,10 @@ from . librarian_client import LibrarianClient
|
|||
from . chunking_service import ChunkingService
|
||||
from . embeddings_service import EmbeddingsService
|
||||
from . embeddings_client import EmbeddingsClientSpec
|
||||
from . text_completion_client import TextCompletionClientSpec
|
||||
from . prompt_client import PromptClientSpec
|
||||
from . text_completion_client import (
|
||||
TextCompletionClientSpec, TextCompletionClient, TextCompletionResult,
|
||||
)
|
||||
from . prompt_client import PromptClientSpec, PromptClient, PromptResult
|
||||
from . triples_store_service import TriplesStoreService
|
||||
from . graph_embeddings_store_service import GraphEmbeddingsStoreService
|
||||
from . document_embeddings_store_service import DocumentEmbeddingsStoreService
|
||||
|
|
|
|||
|
|
@ -1,10 +1,22 @@
|
|||
|
||||
import json
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Any
|
||||
|
||||
from . request_response_spec import RequestResponse, RequestResponseSpec
|
||||
from .. schema import PromptRequest, PromptResponse
|
||||
|
||||
@dataclass
class PromptResult:
    """
    Result of a prompt-template invocation.

    Exactly one payload field is populated according to
    ``response_type``: ``text`` for "text", ``object`` for "json",
    ``objects`` for "jsonl". Token fields are None when the service
    did not report usage.
    """
    response_type: str              # "text", "json", or "jsonl"
    text: Optional[str] = None      # payload when response_type == "text"
    object: Any = None              # payload when response_type == "json"
    objects: Optional[list] = None  # payload when response_type == "jsonl"
    in_token: Optional[int] = None  # input token count, if reported
    out_token: Optional[int] = None # output token count, if reported
    model: Optional[str] = None     # model identifier, if reported
|
||||
|
||||
class PromptClient(RequestResponse):
|
||||
|
||||
async def prompt(self, id, variables, timeout=600, streaming=False, chunk_callback=None):
|
||||
|
|
@ -26,17 +38,40 @@ class PromptClient(RequestResponse):
|
|||
if resp.error:
|
||||
raise RuntimeError(resp.error.message)
|
||||
|
||||
if resp.text: return resp.text
|
||||
if resp.text:
|
||||
return PromptResult(
|
||||
response_type="text",
|
||||
text=resp.text,
|
||||
in_token=resp.in_token,
|
||||
out_token=resp.out_token,
|
||||
model=resp.model,
|
||||
)
|
||||
|
||||
return json.loads(resp.object)
|
||||
parsed = json.loads(resp.object)
|
||||
|
||||
if isinstance(parsed, list):
|
||||
return PromptResult(
|
||||
response_type="jsonl",
|
||||
objects=parsed,
|
||||
in_token=resp.in_token,
|
||||
out_token=resp.out_token,
|
||||
model=resp.model,
|
||||
)
|
||||
|
||||
return PromptResult(
|
||||
response_type="json",
|
||||
object=parsed,
|
||||
in_token=resp.in_token,
|
||||
out_token=resp.out_token,
|
||||
model=resp.model,
|
||||
)
|
||||
|
||||
else:
|
||||
|
||||
last_text = ""
|
||||
last_object = None
|
||||
last_resp = None
|
||||
|
||||
async def forward_chunks(resp):
|
||||
nonlocal last_text, last_object
|
||||
nonlocal last_resp
|
||||
|
||||
if resp.error:
|
||||
raise RuntimeError(resp.error.message)
|
||||
|
|
@ -44,14 +79,13 @@ class PromptClient(RequestResponse):
|
|||
end_stream = getattr(resp, 'end_of_stream', False)
|
||||
|
||||
if resp.text is not None:
|
||||
last_text = resp.text
|
||||
if chunk_callback:
|
||||
if asyncio.iscoroutinefunction(chunk_callback):
|
||||
await chunk_callback(resp.text, end_stream)
|
||||
else:
|
||||
chunk_callback(resp.text, end_stream)
|
||||
elif resp.object:
|
||||
last_object = resp.object
|
||||
|
||||
last_resp = resp
|
||||
|
||||
return end_stream
|
||||
|
||||
|
|
@ -70,10 +104,36 @@ class PromptClient(RequestResponse):
|
|||
timeout=timeout
|
||||
)
|
||||
|
||||
if last_text:
|
||||
return last_text
|
||||
if last_resp is None:
|
||||
return PromptResult(response_type="text")
|
||||
|
||||
return json.loads(last_object) if last_object else None
|
||||
if last_resp.object:
|
||||
parsed = json.loads(last_resp.object)
|
||||
|
||||
if isinstance(parsed, list):
|
||||
return PromptResult(
|
||||
response_type="jsonl",
|
||||
objects=parsed,
|
||||
in_token=last_resp.in_token,
|
||||
out_token=last_resp.out_token,
|
||||
model=last_resp.model,
|
||||
)
|
||||
|
||||
return PromptResult(
|
||||
response_type="json",
|
||||
object=parsed,
|
||||
in_token=last_resp.in_token,
|
||||
out_token=last_resp.out_token,
|
||||
model=last_resp.model,
|
||||
)
|
||||
|
||||
return PromptResult(
|
||||
response_type="text",
|
||||
text=last_resp.text,
|
||||
in_token=last_resp.in_token,
|
||||
out_token=last_resp.out_token,
|
||||
model=last_resp.model,
|
||||
)
|
||||
|
||||
async def extract_definitions(self, text, timeout=600):
|
||||
return await self.prompt(
|
||||
|
|
@ -152,4 +212,3 @@ class PromptClientSpec(RequestResponseSpec):
|
|||
response_schema = PromptResponse,
|
||||
impl = PromptClient,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,47 +1,71 @@
|
|||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
from . request_response_spec import RequestResponse, RequestResponseSpec
|
||||
from .. schema import TextCompletionRequest, TextCompletionResponse
|
||||
|
||||
@dataclass
class TextCompletionResult:
    """
    Outcome of a text completion call.

    ``text`` holds the full response for non-streaming calls and is
    None for streaming calls (content arrives via the chunk handler).
    Token counts and model are None when the service did not report
    them — distinct from a real zero count.
    """
    text: Optional[str]
    in_token: Optional[int] = None
    out_token: Optional[int] = None
    model: Optional[str] = None
|
||||
|
||||
class TextCompletionClient(RequestResponse):

    async def text_completion(self, system, prompt, timeout=600):
        """
        Non-streaming text completion.

        Args:
            system: System prompt for the LLM.
            prompt: User prompt.
            timeout: Request timeout in seconds.

        Returns:
            TextCompletionResult with the complete response text plus
            in_token / out_token / model (None when not reported).

        Raises:
            RuntimeError: if the service response carries an error.
        """
        resp = await self.request(
            TextCompletionRequest(
                system = system, prompt = prompt, streaming = False
            ),
            timeout=timeout
        )

        if resp.error:
            raise RuntimeError(resp.error.message)

        return TextCompletionResult(
            text = resp.response,
            in_token = resp.in_token,
            out_token = resp.out_token,
            model = resp.model,
        )
|
||||
|
||||
async def text_completion_stream(
    self, system, prompt, handler, timeout=600,
):
    """
    Streaming text completion.

    `handler` is an async callable invoked once per chunk with the
    chunk's TextCompletionResponse. Returns a TextCompletionResult
    with text=None; token counts and model are taken from the
    end_of_stream message.
    """

    async def on_chunk(resp):
        if resp.error:
            raise RuntimeError(resp.error.message)

        # Deliver the raw chunk to the caller's handler.
        await handler(resp)

        # Signal completion to the request loop on the final message.
        return getattr(resp, "end_of_stream", False)

    final = await self.request(
        TextCompletionRequest(
            system = system, prompt = prompt, streaming = True
        ),
        recipient=on_chunk,
        timeout=timeout,
    )

    # Streaming mode: text is delivered chunk-by-chunk, so only the
    # usage fields from the terminating message are returned here.
    return TextCompletionResult(
        text = None,
        in_token = final.in_token,
        out_token = final.out_token,
        model = final.model,
    )
|
||||
|
||||
class TextCompletionClientSpec(RequestResponseSpec):
|
||||
def __init__(
|
||||
|
|
@ -54,4 +78,3 @@ class TextCompletionClientSpec(RequestResponseSpec):
|
|||
response_schema = TextCompletionResponse,
|
||||
impl = TextCompletionClient,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -90,6 +90,13 @@ class AgentResponseTranslator(MessageTranslator):
|
|||
if hasattr(obj, 'error') and obj.error and obj.error.message:
|
||||
result["error"] = {"message": obj.error.message, "code": obj.error.code}
|
||||
|
||||
if obj.in_token is not None:
|
||||
result["in_token"] = obj.in_token
|
||||
if obj.out_token is not None:
|
||||
result["out_token"] = obj.out_token
|
||||
if obj.model is not None:
|
||||
result["model"] = obj.model
|
||||
|
||||
return result
|
||||
|
||||
def encode_with_completion(self, obj: AgentResponse) -> Tuple[Dict[str, Any], bool]:
|
||||
|
|
|
|||
|
|
@ -53,6 +53,13 @@ class PromptResponseTranslator(MessageTranslator):
|
|||
# Always include end_of_stream flag for streaming support
|
||||
result["end_of_stream"] = getattr(obj, "end_of_stream", False)
|
||||
|
||||
if obj.in_token is not None:
|
||||
result["in_token"] = obj.in_token
|
||||
if obj.out_token is not None:
|
||||
result["out_token"] = obj.out_token
|
||||
if obj.model is not None:
|
||||
result["model"] = obj.model
|
||||
|
||||
return result
|
||||
|
||||
def encode_with_completion(self, obj: PromptResponse) -> Tuple[Dict[str, Any], bool]:
|
||||
|
|
|
|||
|
|
@ -74,6 +74,13 @@ class DocumentRagResponseTranslator(MessageTranslator):
|
|||
if hasattr(obj, 'error') and obj.error and obj.error.message:
|
||||
result["error"] = {"message": obj.error.message, "type": obj.error.type}
|
||||
|
||||
if obj.in_token is not None:
|
||||
result["in_token"] = obj.in_token
|
||||
if obj.out_token is not None:
|
||||
result["out_token"] = obj.out_token
|
||||
if obj.model is not None:
|
||||
result["model"] = obj.model
|
||||
|
||||
return result
|
||||
|
||||
def encode_with_completion(self, obj: DocumentRagResponse) -> Tuple[Dict[str, Any], bool]:
|
||||
|
|
@ -163,6 +170,13 @@ class GraphRagResponseTranslator(MessageTranslator):
|
|||
if hasattr(obj, 'error') and obj.error and obj.error.message:
|
||||
result["error"] = {"message": obj.error.message, "type": obj.error.type}
|
||||
|
||||
if obj.in_token is not None:
|
||||
result["in_token"] = obj.in_token
|
||||
if obj.out_token is not None:
|
||||
result["out_token"] = obj.out_token
|
||||
if obj.model is not None:
|
||||
result["model"] = obj.model
|
||||
|
||||
return result
|
||||
|
||||
def encode_with_completion(self, obj: GraphRagResponse) -> Tuple[Dict[str, Any], bool]:
|
||||
|
|
|
|||
|
|
@ -29,11 +29,11 @@ class TextCompletionResponseTranslator(MessageTranslator):
|
|||
def encode(self, obj: TextCompletionResponse) -> Dict[str, Any]:
|
||||
result = {"response": obj.response}
|
||||
|
||||
if obj.in_token:
|
||||
if obj.in_token is not None:
|
||||
result["in_token"] = obj.in_token
|
||||
if obj.out_token:
|
||||
if obj.out_token is not None:
|
||||
result["out_token"] = obj.out_token
|
||||
if obj.model:
|
||||
if obj.model is not None:
|
||||
result["model"] = obj.model
|
||||
|
||||
# Always include end_of_stream flag for streaming support
|
||||
|
|
|
|||
|
|
@ -66,5 +66,10 @@ class AgentResponse:
|
|||
|
||||
error: Error | None = None
|
||||
|
||||
# Token usage (populated on end_of_dialog message)
|
||||
in_token: int | None = None
|
||||
out_token: int | None = None
|
||||
model: str | None = None
|
||||
|
||||
############################################################################
|
||||
|
||||
|
|
|
|||
|
|
@ -17,9 +17,9 @@ class TextCompletionRequest:
|
|||
class TextCompletionResponse:
|
||||
error: Error | None = None
|
||||
response: str = ""
|
||||
in_token: int = 0
|
||||
out_token: int = 0
|
||||
model: str = ""
|
||||
in_token: int | None = None
|
||||
out_token: int | None = None
|
||||
model: str | None = None
|
||||
end_of_stream: bool = False # Indicates final message in stream
|
||||
|
||||
############################################################################
|
||||
|
|
|
|||
|
|
@ -41,4 +41,9 @@ class PromptResponse:
|
|||
# Indicates final message in stream
|
||||
end_of_stream: bool = False
|
||||
|
||||
# Token usage from the underlying text completion
|
||||
in_token: int | None = None
|
||||
out_token: int | None = None
|
||||
model: str | None = None
|
||||
|
||||
############################################################################
|
||||
|
|
@ -29,6 +29,9 @@ class GraphRagResponse:
|
|||
explain_triples: list[Triple] = field(default_factory=list) # Provenance triples for this step
|
||||
message_type: str = "" # "chunk" or "explain"
|
||||
end_of_session: bool = False # Entire session complete
|
||||
in_token: int | None = None
|
||||
out_token: int | None = None
|
||||
model: str | None = None
|
||||
|
||||
############################################################################
|
||||
|
||||
|
|
@ -52,3 +55,6 @@ class DocumentRagResponse:
|
|||
explain_triples: list[Triple] = field(default_factory=list) # Provenance triples for this step
|
||||
message_type: str = "" # "chunk" or "explain"
|
||||
end_of_session: bool = False # Entire session complete
|
||||
in_token: int | None = None
|
||||
out_token: int | None = None
|
||||
model: str | None = None
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue