Fix LLM metrics (#631)

* Fix Mistral metrics
* Apply the same fix to the other models
2026-06-09 06:45:13 +02:00 · 2026-02-09 19:35:42 +00:00 · 2026-02-09 19:35:42 +00:00 · 2781c7d87c
commit 2781c7d87c
parent 4fca97d555
8 changed files with 100 additions and 27 deletions
--- a/trustgraph-flow/trustgraph/model/text_completion/azure/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/azure/llm.py
@ -75,6 +75,7 @@ class Processor(LlmService):

        if stream:
            data["stream"] = True
+            data["stream_options"] = {"include_usage": True}

        body = json.dumps(data)

@ -191,6 +192,9 @@ class Processor(LlmService):
            if response.status_code != 200:
                raise RuntimeError("LLM failure")

+            total_input_tokens = 0
+            total_output_tokens = 0
+
            # Parse SSE stream
            for line in response.iter_lines():
                if line:
@ -215,15 +219,21 @@ class Processor(LlmService):
                                        model=model_name,
                                        is_final=False
                                    )
+
+                            # Capture usage from final chunk
+                            if 'usage' in chunk_data and chunk_data['usage']:
+                                total_input_tokens = chunk_data['usage'].get('prompt_tokens', 0)
+                                total_output_tokens = chunk_data['usage'].get('completion_tokens', 0)
+
                        except json.JSONDecodeError:
                            logger.warning(f"Failed to parse chunk: {data}")
                            continue

-            # Send final chunk
+            # Send final chunk with token counts
            yield LlmChunk(
                text="",
-                in_token=None,
-                out_token=None,
+                in_token=total_input_tokens,
+                out_token=total_output_tokens,
                model=model_name,
                is_final=True
            )
--- a/trustgraph-flow/trustgraph/model/text_completion/azure_openai/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/azure_openai/llm.py
@ -161,9 +161,13 @@ class Processor(LlmService):
                temperature=effective_temperature,
                max_tokens=self.max_output,
                top_p=1,
-                stream=True  # Enable streaming
+                stream=True,
+                stream_options={"include_usage": True}
            )

+            total_input_tokens = 0
+            total_output_tokens = 0
+
            # Stream chunks
            for chunk in response:
                if chunk.choices and chunk.choices[0].delta.content:
@ -175,11 +179,16 @@ class Processor(LlmService):
                        is_final=False
                    )

-            # Send final chunk
+                # Capture usage from final chunk
+                if chunk.usage:
+                    total_input_tokens = chunk.usage.prompt_tokens
+                    total_output_tokens = chunk.usage.completion_tokens
+
+            # Send final chunk with token counts
            yield LlmChunk(
                text="",
-                in_token=None,
-                out_token=None,
+                in_token=total_input_tokens,
+                out_token=total_output_tokens,
                model=model_name,
                is_final=True
            )
--- a/trustgraph-flow/trustgraph/model/text_completion/llamafile/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/llamafile/llm.py
@ -126,9 +126,13 @@ class Processor(LlmService):
                frequency_penalty=0,
                presence_penalty=0,
                response_format={"type": "text"},
-                stream=True
+                stream=True,
+                stream_options={"include_usage": True}
            )

+            total_input_tokens = 0
+            total_output_tokens = 0
+
            for chunk in response:
                if chunk.choices and chunk.choices[0].delta.content:
                    yield LlmChunk(
@ -139,10 +143,15 @@ class Processor(LlmService):
                        is_final=False
                    )

+                # Capture usage from final chunk
+                if chunk.usage:
+                    total_input_tokens = chunk.usage.prompt_tokens
+                    total_output_tokens = chunk.usage.completion_tokens
+
            yield LlmChunk(
                text="",
-                in_token=None,
-                out_token=None,
+                in_token=total_input_tokens,
+                out_token=total_output_tokens,
                model=model_name,
                is_final=True
            )
--- a/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py
@ -130,9 +130,13 @@ class Processor(LlmService):
                frequency_penalty=0,
                presence_penalty=0,
                response_format={"type": "text"},
-                stream=True
+                stream=True,
+                stream_options={"include_usage": True}
            )

+            total_input_tokens = 0
+            total_output_tokens = 0
+
            for chunk in response:
                if chunk.choices and chunk.choices[0].delta.content:
                    yield LlmChunk(
@ -143,10 +147,15 @@ class Processor(LlmService):
                        is_final=False
                    )

+                # Capture usage from final chunk
+                if chunk.usage:
+                    total_input_tokens = chunk.usage.prompt_tokens
+                    total_output_tokens = chunk.usage.completion_tokens
+
            yield LlmChunk(
                text="",
-                in_token=None,
-                out_token=None,
+                in_token=total_input_tokens,
+                out_token=total_output_tokens,
                model=model_name,
                is_final=True
            )
--- a/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
@ -156,6 +156,9 @@ class Processor(LlmService):
                response_format={"type": "text"}
            )

+            total_input_tokens = 0
+            total_output_tokens = 0
+
            for chunk in stream:
                if chunk.data.choices and chunk.data.choices[0].delta.content:
                    yield LlmChunk(
@ -166,11 +169,16 @@ class Processor(LlmService):
                        is_final=False
                    )

-            # Send final chunk
+                # Capture usage data when available (typically in final chunk)
+                if chunk.data.usage:
+                    total_input_tokens = chunk.data.usage.prompt_tokens
+                    total_output_tokens = chunk.data.usage.completion_tokens
+
+            # Send final chunk with token counts
            yield LlmChunk(
                text="",
-                in_token=None,
-                out_token=None,
+                in_token=total_input_tokens,
+                out_token=total_output_tokens,
                model=model_name,
                is_final=True
            )
--- a/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py
@ -153,9 +153,13 @@ class Processor(LlmService):
                ],
                temperature=effective_temperature,
                max_tokens=self.max_output,
-                stream=True  # Enable streaming
+                stream=True,
+                stream_options={"include_usage": True}
            )

+            total_input_tokens = 0
+            total_output_tokens = 0
+
            # Stream chunks
            for chunk in response:
                if chunk.choices and chunk.choices[0].delta.content:
@ -167,12 +171,16 @@ class Processor(LlmService):
                        is_final=False
                    )

-            # Note: OpenAI doesn't provide token counts in streaming mode
-            # Send final chunk without token counts
+                # Capture usage from final chunk
+                if chunk.usage:
+                    total_input_tokens = chunk.usage.prompt_tokens
+                    total_output_tokens = chunk.usage.completion_tokens
+
+            # Send final chunk with token counts
            yield LlmChunk(
                text="",
-                in_token=None,
-                out_token=None,
+                in_token=total_input_tokens,
+                out_token=total_output_tokens,
                model=model_name,
                is_final=True
            )
--- a/trustgraph-flow/trustgraph/model/text_completion/tgi/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/tgi/llm.py
@ -152,11 +152,15 @@ class Processor(LlmService):
            "max_tokens": self.max_output,
            "temperature": effective_temperature,
            "stream": True,
+            "stream_options": {"include_usage": True},
        }

        try:
            url = f"{self.base_url.rstrip('/')}/chat/completions"

+            total_input_tokens = 0
+            total_output_tokens = 0
+
            async with self.session.post(
                    url,
                    headers=headers,
@ -196,15 +200,21 @@ class Processor(LlmService):
                                            model=model_name,
                                            is_final=False
                                        )
+
+                            # Capture usage from final chunk
+                            if 'usage' in chunk_data and chunk_data['usage']:
+                                total_input_tokens = chunk_data['usage'].get('prompt_tokens', 0)
+                                total_output_tokens = chunk_data['usage'].get('completion_tokens', 0)
+
                        except json.JSONDecodeError:
                            logger.warning(f"Failed to parse chunk: {data}")
                            continue

-                # Send final chunk
+                # Send final chunk with token counts
                yield LlmChunk(
                    text="",
-                    in_token=None,
-                    out_token=None,
+                    in_token=total_input_tokens,
+                    out_token=total_output_tokens,
                    model=model_name,
                    is_final=True
                )
--- a/trustgraph-flow/trustgraph/model/text_completion/vllm/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/vllm/llm.py
@ -135,11 +135,15 @@ class Processor(LlmService):
            "max_tokens": self.max_output,
            "temperature": effective_temperature,
            "stream": True,
+            "stream_options": {"include_usage": True},
        }

        try:
            url = f"{self.base_url.rstrip('/')}/completions"

+            total_input_tokens = 0
+            total_output_tokens = 0
+
            async with self.session.post(
                    url,
                    headers=headers,
@ -177,15 +181,21 @@ class Processor(LlmService):
                                        model=model_name,
                                        is_final=False
                                    )
+
+                            # Capture usage from final chunk
+                            if 'usage' in chunk_data and chunk_data['usage']:
+                                total_input_tokens = chunk_data['usage'].get('prompt_tokens', 0)
+                                total_output_tokens = chunk_data['usage'].get('completion_tokens', 0)
+
                        except json.JSONDecodeError:
                            logger.warning(f"Failed to parse chunk: {data}")
                            continue

-                # Send final chunk
+                # Send final chunk with token counts
                yield LlmChunk(
                    text="",
-                    in_token=None,
-                    out_token=None,
+                    in_token=total_input_tokens,
+                    out_token=total_output_tokens,
                    model=model_name,
                    is_final=True
                )