diff --git a/trustgraph-flow/trustgraph/model/text_completion/azure/llm.py b/trustgraph-flow/trustgraph/model/text_completion/azure/llm.py
index 614c1362..4e3db7f9 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/azure/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/azure/llm.py
@@ -75,6 +75,7 @@ class Processor(LlmService):
 
         if stream:
             data["stream"] = True
+            data["stream_options"] = {"include_usage": True}
 
         body = json.dumps(data)
 
@@ -191,6 +192,9 @@ class Processor(LlmService):
         if response.status_code != 200:
             raise RuntimeError("LLM failure")
 
+        total_input_tokens = 0
+        total_output_tokens = 0
+
         # Parse SSE stream
         for line in response.iter_lines():
             if line:
@@ -215,15 +219,21 @@ class Processor(LlmService):
                                 model=model_name,
                                 is_final=False
                             )
+
+                        # Capture usage from final chunk
+                        if 'usage' in chunk_data and chunk_data['usage']:
+                            total_input_tokens = chunk_data['usage'].get('prompt_tokens', 0)
+                            total_output_tokens = chunk_data['usage'].get('completion_tokens', 0)
+
                     except json.JSONDecodeError:
                         logger.warning(f"Failed to parse chunk: {data}")
                         continue
 
-        # Send final chunk
+        # Send final chunk with token counts
         yield LlmChunk(
             text="",
-            in_token=None,
-            out_token=None,
+            in_token=total_input_tokens,
+            out_token=total_output_tokens,
             model=model_name,
             is_final=True
         )
diff --git a/trustgraph-flow/trustgraph/model/text_completion/azure_openai/llm.py b/trustgraph-flow/trustgraph/model/text_completion/azure_openai/llm.py
index 950c006a..4ab0b302 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/azure_openai/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/azure_openai/llm.py
@@ -161,9 +161,13 @@ class Processor(LlmService):
             temperature=effective_temperature,
             max_tokens=self.max_output,
             top_p=1,
-            stream=True  # Enable streaming
+            stream=True,
+            stream_options={"include_usage": True}
         )
 
+        total_input_tokens = 0
+        total_output_tokens = 0
+
         # Stream chunks
         for chunk in response:
             if chunk.choices and chunk.choices[0].delta.content:
@@ -175,11 +179,16 @@ class Processor(LlmService):
                     is_final=False
                 )
 
-        # Send final chunk
+            # Capture usage from final chunk
+            if chunk.usage:
+                total_input_tokens = chunk.usage.prompt_tokens
+                total_output_tokens = chunk.usage.completion_tokens
+
+        # Send final chunk with token counts
         yield LlmChunk(
             text="",
-            in_token=None,
-            out_token=None,
+            in_token=total_input_tokens,
+            out_token=total_output_tokens,
             model=model_name,
             is_final=True
         )
diff --git a/trustgraph-flow/trustgraph/model/text_completion/llamafile/llm.py b/trustgraph-flow/trustgraph/model/text_completion/llamafile/llm.py
index 801ed067..276727b5 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/llamafile/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/llamafile/llm.py
@@ -126,9 +126,13 @@ class Processor(LlmService):
             frequency_penalty=0,
             presence_penalty=0,
             response_format={"type": "text"},
-            stream=True
+            stream=True,
+            stream_options={"include_usage": True}
         )
 
+        total_input_tokens = 0
+        total_output_tokens = 0
+
         for chunk in response:
             if chunk.choices and chunk.choices[0].delta.content:
                 yield LlmChunk(
@@ -139,10 +143,15 @@ class Processor(LlmService):
                     is_final=False
                 )
 
+            # Capture usage from final chunk
+            if chunk.usage:
+                total_input_tokens = chunk.usage.prompt_tokens
+                total_output_tokens = chunk.usage.completion_tokens
+
         yield LlmChunk(
             text="",
-            in_token=None,
-            out_token=None,
+            in_token=total_input_tokens,
+            out_token=total_output_tokens,
             model=model_name,
             is_final=True
         )
diff --git a/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py b/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py
index 078a890e..b057f58d 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py
@@ -130,9 +130,13 @@ class Processor(LlmService):
             frequency_penalty=0,
             presence_penalty=0,
             response_format={"type": "text"},
-            stream=True
+            stream=True,
+            stream_options={"include_usage": True}
         )
 
+        total_input_tokens = 0
+        total_output_tokens = 0
+
         for chunk in response:
             if chunk.choices and chunk.choices[0].delta.content:
                 yield LlmChunk(
@@ -143,10 +147,15 @@ class Processor(LlmService):
                     is_final=False
                 )
 
+            # Capture usage from final chunk
+            if chunk.usage:
+                total_input_tokens = chunk.usage.prompt_tokens
+                total_output_tokens = chunk.usage.completion_tokens
+
         yield LlmChunk(
             text="",
-            in_token=None,
-            out_token=None,
+            in_token=total_input_tokens,
+            out_token=total_output_tokens,
             model=model_name,
             is_final=True
         )
diff --git a/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py b/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
index 7952b1df..fab41ecd 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
@@ -156,6 +156,9 @@ class Processor(LlmService):
             response_format={"type": "text"}
         )
 
+        total_input_tokens = 0
+        total_output_tokens = 0
+
         for chunk in stream:
             if chunk.data.choices and chunk.data.choices[0].delta.content:
                 yield LlmChunk(
@@ -166,11 +169,16 @@ class Processor(LlmService):
                     is_final=False
                 )
 
-        # Send final chunk
+            # Capture usage data when available (typically in final chunk)
+            if chunk.data.usage:
+                total_input_tokens = chunk.data.usage.prompt_tokens
+                total_output_tokens = chunk.data.usage.completion_tokens
+
+        # Send final chunk with token counts
         yield LlmChunk(
             text="",
-            in_token=None,
-            out_token=None,
+            in_token=total_input_tokens,
+            out_token=total_output_tokens,
             model=model_name,
             is_final=True
         )
diff --git a/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py b/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py
index 4da1378b..d65e27bf 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py
@@ -153,9 +153,13 @@ class Processor(LlmService):
             ],
             temperature=effective_temperature,
             max_tokens=self.max_output,
-            stream=True  # Enable streaming
+            stream=True,
+            stream_options={"include_usage": True}
         )
 
+        total_input_tokens = 0
+        total_output_tokens = 0
+
         # Stream chunks
         for chunk in response:
             if chunk.choices and chunk.choices[0].delta.content:
@@ -167,12 +171,16 @@ class Processor(LlmService):
                     is_final=False
                 )
 
-        # Note: OpenAI doesn't provide token counts in streaming mode
-        # Send final chunk without token counts
+            # Capture usage from final chunk
+            if chunk.usage:
+                total_input_tokens = chunk.usage.prompt_tokens
+                total_output_tokens = chunk.usage.completion_tokens
+
+        # Send final chunk with token counts
         yield LlmChunk(
             text="",
-            in_token=None,
-            out_token=None,
+            in_token=total_input_tokens,
+            out_token=total_output_tokens,
             model=model_name,
             is_final=True
         )
diff --git a/trustgraph-flow/trustgraph/model/text_completion/tgi/llm.py b/trustgraph-flow/trustgraph/model/text_completion/tgi/llm.py
index ca6da1ba..5caeb9be 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/tgi/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/tgi/llm.py
@@ -152,11 +152,15 @@ class Processor(LlmService):
             "max_tokens": self.max_output,
             "temperature": effective_temperature,
             "stream": True,
+            "stream_options": {"include_usage": True},
         }
 
         try:
             url = f"{self.base_url.rstrip('/')}/chat/completions"
 
+            total_input_tokens = 0
+            total_output_tokens = 0
+
             async with self.session.post(
                 url,
                 headers=headers,
@@ -196,15 +200,21 @@ class Processor(LlmService):
                                     model=model_name,
                                     is_final=False
                                 )
+
+                            # Capture usage from final chunk
+                            if 'usage' in chunk_data and chunk_data['usage']:
+                                total_input_tokens = chunk_data['usage'].get('prompt_tokens', 0)
+                                total_output_tokens = chunk_data['usage'].get('completion_tokens', 0)
+
                         except json.JSONDecodeError:
                             logger.warning(f"Failed to parse chunk: {data}")
                             continue
 
-            # Send final chunk
+            # Send final chunk with token counts
             yield LlmChunk(
                 text="",
-                in_token=None,
-                out_token=None,
+                in_token=total_input_tokens,
+                out_token=total_output_tokens,
                 model=model_name,
                 is_final=True
             )
diff --git a/trustgraph-flow/trustgraph/model/text_completion/vllm/llm.py b/trustgraph-flow/trustgraph/model/text_completion/vllm/llm.py
index 8d832b5c..2dd4576e 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/vllm/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/vllm/llm.py
@@ -135,11 +135,15 @@ class Processor(LlmService):
             "max_tokens": self.max_output,
             "temperature": effective_temperature,
             "stream": True,
+            "stream_options": {"include_usage": True},
         }
 
         try:
             url = f"{self.base_url.rstrip('/')}/completions"
 
+            total_input_tokens = 0
+            total_output_tokens = 0
+
             async with self.session.post(
                 url,
                 headers=headers,
@@ -177,15 +181,21 @@ class Processor(LlmService):
                                     model=model_name,
                                     is_final=False
                                 )
+
+                            # Capture usage from final chunk
+                            if 'usage' in chunk_data and chunk_data['usage']:
+                                total_input_tokens = chunk_data['usage'].get('prompt_tokens', 0)
+                                total_output_tokens = chunk_data['usage'].get('completion_tokens', 0)
+
                         except json.JSONDecodeError:
                             logger.warning(f"Failed to parse chunk: {data}")
                             continue
 
-            # Send final chunk
+            # Send final chunk with token counts
             yield LlmChunk(
                 text="",
-                in_token=None,
-                out_token=None,
+                in_token=total_input_tokens,
+                out_token=total_output_tokens,
                 model=model_name,
                 is_final=True
             )
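All eight providers now follow the same pattern: request usage reporting with stream_options={"include_usage": True}, then read the cumulative counts off the trailing usage-only chunk. For reference, a minimal standalone sketch of that pattern against the OpenAI Python SDK (stream_options requires openai >= 1.26; the client setup and model name below are illustrative, not taken from this patch):

# Minimal sketch of the usage-capture pattern applied above.
# Assumes the official `openai` SDK and OPENAI_API_KEY in the
# environment; the model name is illustrative.
from openai import OpenAI

client = OpenAI()

def complete_with_usage(prompt: str) -> tuple[str, int, int]:
    stream = client.chat.completions.create(
        model="gpt-4o-mini",  # illustrative
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        stream_options={"include_usage": True},
    )

    parts: list[str] = []
    in_tokens = out_tokens = 0

    for chunk in stream:
        # Content deltas arrive on ordinary chunks; the trailing
        # usage-only chunk has an empty `choices` list, so guard on it.
        if chunk.choices and chunk.choices[0].delta.content:
            parts.append(chunk.choices[0].delta.content)

        # With include_usage set, the final chunk reports cumulative
        # token counts for the whole request.
        if chunk.usage:
            in_tokens = chunk.usage.prompt_tokens
            out_tokens = chunk.usage.completion_tokens

    return "".join(parts), in_tokens, out_tokens

The chunk.choices guard matters: with include_usage set, the final chunk carries usage but an empty choices list, so indexing choices[0] unguarded would raise IndexError. The raw-HTTP providers (azure, tgi, vllm) hit the same case in JSON form, which is why they check 'usage' in chunk_data rather than assuming every event has choices.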