Fix LLM metrics (#631)

* Fix Mistral metrics

* Apply the same fix to the other models
This commit is contained in:
cybermaggedon 2026-02-09 19:35:42 +00:00 committed by GitHub
parent 4fca97d555
commit 2781c7d87c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 100 additions and 27 deletions

View file

@ -75,6 +75,7 @@ class Processor(LlmService):
if stream:
data["stream"] = True
data["stream_options"] = {"include_usage": True}
body = json.dumps(data)
@ -191,6 +192,9 @@ class Processor(LlmService):
if response.status_code != 200:
raise RuntimeError("LLM failure")
total_input_tokens = 0
total_output_tokens = 0
# Parse SSE stream
for line in response.iter_lines():
if line:
@ -215,15 +219,21 @@ class Processor(LlmService):
model=model_name,
is_final=False
)
# Capture usage from final chunk
if 'usage' in chunk_data and chunk_data['usage']:
total_input_tokens = chunk_data['usage'].get('prompt_tokens', 0)
total_output_tokens = chunk_data['usage'].get('completion_tokens', 0)
except json.JSONDecodeError:
logger.warning(f"Failed to parse chunk: {data}")
continue
# Send final chunk
# Send final chunk with token counts
yield LlmChunk(
text="",
in_token=None,
out_token=None,
in_token=total_input_tokens,
out_token=total_output_tokens,
model=model_name,
is_final=True
)

View file

@ -161,9 +161,13 @@ class Processor(LlmService):
temperature=effective_temperature,
max_tokens=self.max_output,
top_p=1,
stream=True # Enable streaming
stream=True,
stream_options={"include_usage": True}
)
total_input_tokens = 0
total_output_tokens = 0
# Stream chunks
for chunk in response:
if chunk.choices and chunk.choices[0].delta.content:
@ -175,11 +179,16 @@ class Processor(LlmService):
is_final=False
)
# Send final chunk
# Capture usage from final chunk
if chunk.usage:
total_input_tokens = chunk.usage.prompt_tokens
total_output_tokens = chunk.usage.completion_tokens
# Send final chunk with token counts
yield LlmChunk(
text="",
in_token=None,
out_token=None,
in_token=total_input_tokens,
out_token=total_output_tokens,
model=model_name,
is_final=True
)

View file

@ -126,9 +126,13 @@ class Processor(LlmService):
frequency_penalty=0,
presence_penalty=0,
response_format={"type": "text"},
stream=True
stream=True,
stream_options={"include_usage": True}
)
total_input_tokens = 0
total_output_tokens = 0
for chunk in response:
if chunk.choices and chunk.choices[0].delta.content:
yield LlmChunk(
@ -139,10 +143,15 @@ class Processor(LlmService):
is_final=False
)
# Capture usage from final chunk
if chunk.usage:
total_input_tokens = chunk.usage.prompt_tokens
total_output_tokens = chunk.usage.completion_tokens
yield LlmChunk(
text="",
in_token=None,
out_token=None,
in_token=total_input_tokens,
out_token=total_output_tokens,
model=model_name,
is_final=True
)

View file

@ -130,9 +130,13 @@ class Processor(LlmService):
frequency_penalty=0,
presence_penalty=0,
response_format={"type": "text"},
stream=True
stream=True,
stream_options={"include_usage": True}
)
total_input_tokens = 0
total_output_tokens = 0
for chunk in response:
if chunk.choices and chunk.choices[0].delta.content:
yield LlmChunk(
@ -143,10 +147,15 @@ class Processor(LlmService):
is_final=False
)
# Capture usage from final chunk
if chunk.usage:
total_input_tokens = chunk.usage.prompt_tokens
total_output_tokens = chunk.usage.completion_tokens
yield LlmChunk(
text="",
in_token=None,
out_token=None,
in_token=total_input_tokens,
out_token=total_output_tokens,
model=model_name,
is_final=True
)

View file

@ -156,6 +156,9 @@ class Processor(LlmService):
response_format={"type": "text"}
)
total_input_tokens = 0
total_output_tokens = 0
for chunk in stream:
if chunk.data.choices and chunk.data.choices[0].delta.content:
yield LlmChunk(
@ -166,11 +169,16 @@ class Processor(LlmService):
is_final=False
)
# Send final chunk
# Capture usage data when available (typically in final chunk)
if chunk.data.usage:
total_input_tokens = chunk.data.usage.prompt_tokens
total_output_tokens = chunk.data.usage.completion_tokens
# Send final chunk with token counts
yield LlmChunk(
text="",
in_token=None,
out_token=None,
in_token=total_input_tokens,
out_token=total_output_tokens,
model=model_name,
is_final=True
)

View file

@ -153,9 +153,13 @@ class Processor(LlmService):
],
temperature=effective_temperature,
max_tokens=self.max_output,
stream=True # Enable streaming
stream=True,
stream_options={"include_usage": True}
)
total_input_tokens = 0
total_output_tokens = 0
# Stream chunks
for chunk in response:
if chunk.choices and chunk.choices[0].delta.content:
@ -167,12 +171,16 @@ class Processor(LlmService):
is_final=False
)
# Note: OpenAI doesn't provide token counts in streaming mode
# Send final chunk without token counts
# Capture usage from final chunk
if chunk.usage:
total_input_tokens = chunk.usage.prompt_tokens
total_output_tokens = chunk.usage.completion_tokens
# Send final chunk with token counts
yield LlmChunk(
text="",
in_token=None,
out_token=None,
in_token=total_input_tokens,
out_token=total_output_tokens,
model=model_name,
is_final=True
)

View file

@ -152,11 +152,15 @@ class Processor(LlmService):
"max_tokens": self.max_output,
"temperature": effective_temperature,
"stream": True,
"stream_options": {"include_usage": True},
}
try:
url = f"{self.base_url.rstrip('/')}/chat/completions"
total_input_tokens = 0
total_output_tokens = 0
async with self.session.post(
url,
headers=headers,
@ -196,15 +200,21 @@ class Processor(LlmService):
model=model_name,
is_final=False
)
# Capture usage from final chunk
if 'usage' in chunk_data and chunk_data['usage']:
total_input_tokens = chunk_data['usage'].get('prompt_tokens', 0)
total_output_tokens = chunk_data['usage'].get('completion_tokens', 0)
except json.JSONDecodeError:
logger.warning(f"Failed to parse chunk: {data}")
continue
# Send final chunk
# Send final chunk with token counts
yield LlmChunk(
text="",
in_token=None,
out_token=None,
in_token=total_input_tokens,
out_token=total_output_tokens,
model=model_name,
is_final=True
)

View file

@ -135,11 +135,15 @@ class Processor(LlmService):
"max_tokens": self.max_output,
"temperature": effective_temperature,
"stream": True,
"stream_options": {"include_usage": True},
}
try:
url = f"{self.base_url.rstrip('/')}/completions"
total_input_tokens = 0
total_output_tokens = 0
async with self.session.post(
url,
headers=headers,
@ -177,15 +181,21 @@ class Processor(LlmService):
model=model_name,
is_final=False
)
# Capture usage from final chunk
if 'usage' in chunk_data and chunk_data['usage']:
total_input_tokens = chunk_data['usage'].get('prompt_tokens', 0)
total_output_tokens = chunk_data['usage'].get('completion_tokens', 0)
except json.JSONDecodeError:
logger.warning(f"Failed to parse chunk: {data}")
continue
# Send final chunk
# Send final chunk with token counts
yield LlmChunk(
text="",
in_token=None,
out_token=None,
in_token=total_input_tokens,
out_token=total_output_tokens,
model=model_name,
is_final=True
)