adding usage metrics to /v1 endpoints if stream == True

Alpha Nerd 2025-11-21 09:56:42 +01:00
parent 45d1d442ee
commit 7b50a5a299
2 changed files with 26 additions and 9 deletions

db.py (+1, -3)

@@ -1,6 +1,4 @@
-import aiosqlite
-import os
-import asyncio
+import aiosqlite, os, asyncio
 from pathlib import Path
 from datetime import datetime, timezone
 from collections import defaultdict

@@ -1596,7 +1596,7 @@ async def openai_chat_completions_proxy(request: Request):
     optional_params = {
         "tools": tools,
         "response_format": response_format,
-        "stream_options": stream_options,
+        "stream_options": stream_options or {"include_usage": True},
         "max_completion_tokens": max_completion_tokens,
         "max_tokens": max_tokens,
         "temperature": temperature,
@@ -1638,8 +1638,17 @@ async def openai_chat_completions_proxy(request: Request):
                     if hasattr(chunk, "model_dump_json")
                     else orjson.dumps(chunk)
                 )
+                if chunk.choices:
+                    if chunk.choices[0].delta.content is not None:
+                        yield f"data: {data}\n\n".encode("utf-8")
+                if chunk.usage is not None:
+                    prompt_tok = chunk.usage.prompt_tokens or 0
+                    comp_tok = chunk.usage.completion_tokens or 0
+                    if prompt_tok != 0 or comp_tok != 0:
+                        if not is_ext_openai_endpoint(endpoint):
+                            if ":" not in model:
+                                model = model + ":latest"
+                        await token_queue.put((endpoint, model, prompt_tok, comp_tok))
             yield b"data: [DONE]\n\n"
         else:
             prompt_tok = async_gen.usage.prompt_tokens or 0
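The two added if blocks handle the two chunk shapes a usage-enabled stream produces: content deltas (non-empty choices, usage unset) and the final accounting chunk (empty choices, usage set). Indexing choices[0] on the latter would raise IndexError, hence the chunk.choices guard. A small self-contained sketch with illustrative field values:

# Why the chunk.choices guard matters; values below are illustrative only.
from openai.types import CompletionUsage
from openai.types.chat import ChatCompletionChunk

usage_chunk = ChatCompletionChunk(
    id="chatcmpl-abc123",
    created=0,
    model="some-model",
    object="chat.completion.chunk",
    choices=[],  # nothing to forward to the client
    usage=CompletionUsage(prompt_tokens=12, completion_tokens=87, total_tokens=99),
)

assert not usage_chunk.choices                # guard path: skip the yield
assert usage_chunk.usage.total_tokens == 99   # accounting path: enqueue the counts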
@@ -1706,7 +1715,7 @@ async def openai_completions_proxy(request: Request):
         "seed": seed,
         "stop": stop,
         "stream": stream,
-        "stream_options": stream_options,
+        "stream_options": stream_options or {"include_usage": True},
         "temperature": temperature,
         "top_p": top_p,
         "max_tokens": max_tokens,
@@ -1734,7 +1743,7 @@ async def openai_completions_proxy(request: Request):
     oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys[endpoint])
     # 3. Async generator that streams completions data and decrements the counter
-    async def stream_ocompletions_response():
+    async def stream_ocompletions_response(model=model):
         try:
             # The chat method returns a generator of dicts (or GenerateResponse)
             async_gen = await oclient.completions.create(**params)
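The model=model default argument is not cosmetic: the generator body below reassigns model (appending ":latest"), which makes model a local variable of the generator, and reading a local before its first assignment raises UnboundLocalError. Binding it as a default argument snapshots the enclosing value instead. A self-contained illustration of the pitfall:

# Why stream_ocompletions_response(model=model) is needed.
def outer():
    model = "some-model"  # placeholder name

    def broken():
        if ":" not in model:          # UnboundLocalError: `model` is local here
            model = model + ":latest"
        return model

    def fixed(model=model):           # default arg captures the outer value
        if ":" not in model:
            model = model + ":latest"
        return model

    try:
        broken()
    except UnboundLocalError as e:
        print("broken():", e)
    print("fixed():", fixed())        # -> some-model:latest

outer()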
@@ -1745,7 +1754,17 @@ async def openai_completions_proxy(request: Request):
                     if hasattr(chunk, "model_dump_json")
                     else orjson.dumps(chunk)
                 )
+                if chunk.choices:
+                    if chunk.choices[0].finish_reason is None:
+                        yield f"data: {data}\n\n".encode("utf-8")
+                if chunk.usage is not None:
+                    prompt_tok = chunk.usage.prompt_tokens or 0
+                    comp_tok = chunk.usage.completion_tokens or 0
+                    if prompt_tok != 0 or comp_tok != 0:
+                        if not is_ext_openai_endpoint(endpoint):
+                            if ":" not in model:
+                                model = model + ":latest"
+                        await token_queue.put((endpoint, model, prompt_tok, comp_tok))
             # Final DONE event
             yield b"data: [DONE]\n\n"
         else:
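The consumer side of token_queue is outside this diff; presumably a background task drains it into the aiosqlite database that db.py manages. A hypothetical sketch of such a consumer (table name, schema, and wiring are assumptions, not this repo's actual code):

# Hypothetical drain task: pops (endpoint, model, prompt_tok, comp_tok)
# tuples and appends them to a usage table.
import asyncio
import aiosqlite
from datetime import datetime, timezone

token_queue: asyncio.Queue = asyncio.Queue()  # filled by the streaming proxies

async def drain_token_queue(db_path: str = "usage.db"):
    async with aiosqlite.connect(db_path) as db:
        await db.execute(
            """CREATE TABLE IF NOT EXISTS usage (
                   ts TEXT, endpoint TEXT, model TEXT,
                   prompt_tokens INTEGER, completion_tokens INTEGER
               )"""
        )
        await db.commit()
        while True:
            endpoint, model, prompt_tok, comp_tok = await token_queue.get()
            await db.execute(
                "INSERT INTO usage VALUES (?, ?, ?, ?, ?)",
                (datetime.now(timezone.utc).isoformat(), endpoint, model, prompt_tok, comp_tok),
            )
            await db.commit()
            token_queue.task_done()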