Add usage metrics to /v1 endpoints when stream == True
This commit is contained in:
parent
45d1d442ee
commit
7b50a5a299
2 changed files with 26 additions and 9 deletions
4
db.py
4
db.py
|
|
@ -1,6 +1,4 @@
|
||||||
import aiosqlite
|
import aiosqlite, os, asyncio
|
||||||
import os
|
|
||||||
import asyncio
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
|
||||||
31
router.py
31
router.py
|
|
@ -1596,7 +1596,7 @@ async def openai_chat_completions_proxy(request: Request):
|
||||||
optional_params = {
|
optional_params = {
|
||||||
"tools": tools,
|
"tools": tools,
|
||||||
"response_format": response_format,
|
"response_format": response_format,
|
||||||
"stream_options": stream_options,
|
"stream_options": stream_options or {"include_usage": True },
|
||||||
"max_completion_tokens": max_completion_tokens,
|
"max_completion_tokens": max_completion_tokens,
|
||||||
"max_tokens": max_tokens,
|
"max_tokens": max_tokens,
|
||||||
"temperature": temperature,
|
"temperature": temperature,
|
||||||
|
|
@ -1638,8 +1638,17 @@ async def openai_chat_completions_proxy(request: Request):
|
||||||
if hasattr(chunk, "model_dump_json")
|
if hasattr(chunk, "model_dump_json")
|
||||||
else orjson.dumps(chunk)
|
else orjson.dumps(chunk)
|
||||||
)
|
)
|
||||||
if chunk.choices[0].delta.content is not None:
|
if chunk.choices:
|
||||||
yield f"data: {data}\n\n".encode("utf-8")
|
if chunk.choices[0].delta.content is not None:
|
||||||
|
yield f"data: {data}\n\n".encode("utf-8")
|
||||||
|
if chunk.usage is not None:
|
||||||
|
prompt_tok = chunk.usage.prompt_tokens or 0
|
||||||
|
comp_tok = chunk.usage.completion_tokens or 0
|
||||||
|
if prompt_tok != 0 or comp_tok != 0:
|
||||||
|
if not is_ext_openai_endpoint(endpoint):
|
||||||
|
if not ":" in model:
|
||||||
|
model = model+":latest"
|
||||||
|
await token_queue.put((endpoint, model, prompt_tok, comp_tok))
|
||||||
yield b"data: [DONE]\n\n"
|
yield b"data: [DONE]\n\n"
|
||||||
else:
|
else:
|
||||||
prompt_tok = async_gen.usage.prompt_tokens or 0
|
prompt_tok = async_gen.usage.prompt_tokens or 0
|
||||||
|
|
@ -1706,7 +1715,7 @@ async def openai_completions_proxy(request: Request):
|
||||||
"seed": seed,
|
"seed": seed,
|
||||||
"stop": stop,
|
"stop": stop,
|
||||||
"stream": stream,
|
"stream": stream,
|
||||||
"stream_options": stream_options,
|
"stream_options": stream_options or {"include_usage": True },
|
||||||
"temperature": temperature,
|
"temperature": temperature,
|
||||||
"top_p": top_p,
|
"top_p": top_p,
|
||||||
"max_tokens": max_tokens,
|
"max_tokens": max_tokens,
|
||||||
|
|
@ -1734,7 +1743,7 @@ async def openai_completions_proxy(request: Request):
|
||||||
oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys[endpoint])
|
oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys[endpoint])
|
||||||
|
|
||||||
# 3. Async generator that streams completions data and decrements the counter
|
# 3. Async generator that streams completions data and decrements the counter
|
||||||
async def stream_ocompletions_response():
|
async def stream_ocompletions_response(model=model):
|
||||||
try:
|
try:
|
||||||
# The chat method returns a generator of dicts (or GenerateResponse)
|
# The chat method returns a generator of dicts (or GenerateResponse)
|
||||||
async_gen = await oclient.completions.create(**params)
|
async_gen = await oclient.completions.create(**params)
|
||||||
|
|
@ -1745,7 +1754,17 @@ async def openai_completions_proxy(request: Request):
|
||||||
if hasattr(chunk, "model_dump_json")
|
if hasattr(chunk, "model_dump_json")
|
||||||
else orjson.dumps(chunk)
|
else orjson.dumps(chunk)
|
||||||
)
|
)
|
||||||
yield f"data: {data}\n\n".encode("utf-8")
|
if chunk.choices:
|
||||||
|
if chunk.choices[0].finish_reason == None:
|
||||||
|
yield f"data: {data}\n\n".encode("utf-8")
|
||||||
|
if chunk.usage is not None:
|
||||||
|
prompt_tok = chunk.usage.prompt_tokens or 0
|
||||||
|
comp_tok = chunk.usage.completion_tokens or 0
|
||||||
|
if prompt_tok != 0 or comp_tok != 0:
|
||||||
|
if not is_ext_openai_endpoint(endpoint):
|
||||||
|
if not ":" in model:
|
||||||
|
model = model+":latest"
|
||||||
|
await token_queue.put((endpoint, model, prompt_tok, comp_tok))
|
||||||
# Final DONE event
|
# Final DONE event
|
||||||
yield b"data: [DONE]\n\n"
|
yield b"data: [DONE]\n\n"
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue