diff --git a/db.py b/db.py
index af0f7e0..21b0b0a 100644
--- a/db.py
+++ b/db.py
@@ -1,6 +1,4 @@
-import aiosqlite
-import os
-import asyncio
+import aiosqlite, os, asyncio
 from pathlib import Path
 from datetime import datetime, timezone
 from collections import defaultdict
diff --git a/router.py b/router.py
index fbc5b6e..46b6e7e 100644
--- a/router.py
+++ b/router.py
@@ -1596,7 +1596,7 @@ async def openai_chat_completions_proxy(request: Request):
     optional_params = {
         "tools": tools,
         "response_format": response_format,
-        "stream_options": stream_options,
+        "stream_options": stream_options or {"include_usage": True},
         "max_completion_tokens": max_completion_tokens,
         "max_tokens": max_tokens,
         "temperature": temperature,
@@ -1638,8 +1638,17 @@ async def openai_chat_completions_proxy(request: Request):
                         if hasattr(chunk, "model_dump_json")
                         else orjson.dumps(chunk)
                     )
-                    if chunk.choices[0].delta.content is not None:
-                        yield f"data: {data}\n\n".encode("utf-8")
+                    if chunk.choices:
+                        if chunk.choices[0].delta.content is not None:
+                            yield f"data: {data}\n\n".encode("utf-8")
+                    if chunk.usage is not None:
+                        prompt_tok = chunk.usage.prompt_tokens or 0
+                        comp_tok = chunk.usage.completion_tokens or 0
+                        if prompt_tok != 0 or comp_tok != 0:
+                            if not is_ext_openai_endpoint(endpoint):
+                                if ":" not in model:
+                                    model = model + ":latest"
+                            await token_queue.put((endpoint, model, prompt_tok, comp_tok))
                 yield b"data: [DONE]\n\n"
             else:
                 prompt_tok = async_gen.usage.prompt_tokens or 0
@@ -1706,7 +1715,7 @@ async def openai_completions_proxy(request: Request):
         "seed": seed,
         "stop": stop,
         "stream": stream,
-        "stream_options": stream_options,
+        "stream_options": stream_options or {"include_usage": True},
         "temperature": temperature,
         "top_p": top_p,
         "max_tokens": max_tokens,
@@ -1734,7 +1743,7 @@ async def openai_completions_proxy(request: Request):
     oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys[endpoint])
 
     # 3. Async generator that streams completions data and decrements the counter
-    async def stream_ocompletions_response():
+    async def stream_ocompletions_response(model=model):
         try:
             # The chat method returns a generator of dicts (or GenerateResponse)
             async_gen = await oclient.completions.create(**params)
@@ -1745,7 +1754,17 @@ async def openai_completions_proxy(request: Request):
                         if hasattr(chunk, "model_dump_json")
                         else orjson.dumps(chunk)
                     )
-                    yield f"data: {data}\n\n".encode("utf-8")
+                    if chunk.choices:
+                        if chunk.choices[0].finish_reason is None:
+                            yield f"data: {data}\n\n".encode("utf-8")
+                    if chunk.usage is not None:
+                        prompt_tok = chunk.usage.prompt_tokens or 0
+                        comp_tok = chunk.usage.completion_tokens or 0
+                        if prompt_tok != 0 or comp_tok != 0:
+                            if not is_ext_openai_endpoint(endpoint):
+                                if ":" not in model:
+                                    model = model + ":latest"
+                            await token_queue.put((endpoint, model, prompt_tok, comp_tok))
                 # Final DONE event
                 yield b"data: [DONE]\n\n"
             else:
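
Note on the streaming changes: with stream_options defaulting to {"include_usage": True}, the OpenAI streaming API emits one final chunk whose choices list is empty and whose usage field is populated, which is why both generators now guard on chunk.choices before indexing into it. Passing model=model in the stream_ocompletions_response signature presumably snapshots the enclosing variable at definition time, since the generator body reassigns it. For context, a minimal sketch of the consumer side of the token accounting, assuming token_queue is an asyncio.Queue and record_usage stands in for a hypothetical db.py writer (neither is defined in this diff):

    import asyncio

    token_queue: asyncio.Queue = asyncio.Queue()

    async def record_usage(endpoint: str, model: str, prompt_tok: int, comp_tok: int) -> None:
        # Hypothetical stand-in for whatever db.py does with a usage row.
        print(f"{endpoint} {model}: prompt={prompt_tok} completion={comp_tok}")

    async def drain_token_queue() -> None:
        # Consumes the (endpoint, model, prompt_tok, comp_tok) tuples that the
        # streaming generators above put on token_queue.
        while True:
            endpoint, model, prompt_tok, comp_tok = await token_queue.get()
            try:
                await record_usage(endpoint, model, prompt_tok, comp_tok)
            finally:
                token_queue.task_done()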