Merge pull request #6 from nomyo-ai/dev-v0.3.x
This is adding quite a few improvements, fixes and already preparations for v0.4 increasing compatibility and stability and even performance.
This commit is contained in:
commit
ffee2baab8
2 changed files with 76 additions and 52 deletions
126
router.py
126
router.py
|
|
@ -100,6 +100,11 @@ app.add_middleware(
|
||||||
allow_methods=["GET", "POST", "DELETE"],
|
allow_methods=["GET", "POST", "DELETE"],
|
||||||
allow_headers=["Authorization", "Content-Type"],
|
allow_headers=["Authorization", "Content-Type"],
|
||||||
)
|
)
|
||||||
|
default_headers={
|
||||||
|
"HTTP-Referer": "https://nomyo.ai",
|
||||||
|
"X-Title": "NOMYO Router",
|
||||||
|
}
|
||||||
|
|
||||||
# -------------------------------------------------------------
|
# -------------------------------------------------------------
|
||||||
# 3. Global state: per‑endpoint per‑model active connection counters
|
# 3. Global state: per‑endpoint per‑model active connection counters
|
||||||
# -------------------------------------------------------------
|
# -------------------------------------------------------------
|
||||||
|
|
@ -271,68 +276,83 @@ def iso8601_ns():
|
||||||
|
|
||||||
class rechunk:
|
class rechunk:
|
||||||
def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float):
|
def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float):
|
||||||
rechunk = { "model": chunk.model,
|
|
||||||
"created_at": iso8601_ns() ,
|
|
||||||
"done_reason": chunk.choices[0].finish_reason,
|
|
||||||
"load_duration": None,
|
|
||||||
"prompt_eval_count": None,
|
|
||||||
"prompt_eval_duration": None,
|
|
||||||
"eval_count": None,
|
|
||||||
"eval_duration": None,
|
|
||||||
"eval_count": (chunk.usage.completion_tokens if chunk.usage is not None else None),
|
|
||||||
"prompt_eval_count": (chunk.usage.prompt_tokens if chunk.usage is not None else None),
|
|
||||||
"eval_duration": (int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None),
|
|
||||||
"response_token/s": (round(chunk.usage.total_tokens / (time.perf_counter() - start_ts), 2) if chunk.usage is not None else None)
|
|
||||||
}
|
|
||||||
if stream == True:
|
if stream == True:
|
||||||
rechunk["message"] = {"role": chunk.choices[0].delta.role or "assistant", "content": chunk.choices[0].delta.content, "thinking": None, "images": None, "tool_name": None, "tool_calls": None}
|
assistant_msg = ollama.Message(
|
||||||
|
role=chunk.choices[0].delta.role or "assistant",
|
||||||
|
content=chunk.choices[0].delta.content,
|
||||||
|
thinking=None,
|
||||||
|
images=None,
|
||||||
|
tool_name=None,
|
||||||
|
tool_calls=None
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
rechunk["message"] = {"role": chunk.choices[0].message.role or "assistant", "content": chunk.choices[0].message.content, "thinking": None, "images": None, "tool_name": None, "tool_calls": None}
|
assistant_msg = ollama.Message(
|
||||||
|
role=chunk.choices[0].message.role or "assistant",
|
||||||
|
content=chunk.choices[0].message.content,
|
||||||
|
thinking=None,
|
||||||
|
images=None,
|
||||||
|
tool_name=None,
|
||||||
|
tool_calls=None
|
||||||
|
)
|
||||||
|
rechunk = ollama.ChatResponse(model=chunk.model,
|
||||||
|
created_at=iso8601_ns(),
|
||||||
|
done_reason=chunk.choices[0].finish_reason,
|
||||||
|
load_duration=100000,
|
||||||
|
prompt_eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)) if chunk.usage is not None else None),
|
||||||
|
eval_count= (chunk.usage.completion_tokens if chunk.usage is not None else None),
|
||||||
|
prompt_eval_count=(chunk.usage.prompt_tokens if chunk.usage is not None else None),
|
||||||
|
eval_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None),
|
||||||
|
total_duration=(int((time.perf_counter() - start_ts) * 1_000_000_000) if chunk.usage is not None else None),
|
||||||
|
message=assistant_msg)
|
||||||
return rechunk
|
return rechunk
|
||||||
|
|
||||||
def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float):
|
def openai_completion2ollama(chunk: dict, stream: bool, start_ts: float):
|
||||||
with_thinking = chunk.choices[0] if chunk.choices[0] else None
|
with_thinking = chunk.choices[0] if chunk.choices[0] else None
|
||||||
thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None
|
thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None
|
||||||
rechunk = { "model": chunk.model,
|
rechunk = ollama.GenerateResponse(model=chunk.model,
|
||||||
"created_at": iso8601_ns(),
|
created_at=iso8601_ns(),
|
||||||
"load_duration": None,
|
load_duration=10000,
|
||||||
"done_reason": chunk.choices[0].finish_reason,
|
done_reason=chunk.choices[0].finish_reason,
|
||||||
"total_duration": None,
|
done=None, #True if chunk.choices[0].finish_reason is not None else False,
|
||||||
"eval_duration": (int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None),
|
total_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None),
|
||||||
"thinking": thinking,
|
eval_duration=(int((time.perf_counter() - start_ts) * 1000) if chunk.usage is not None else None),
|
||||||
"context": None,
|
thinking=thinking,
|
||||||
"response": chunk.choices[0].text
|
response=chunk.choices[0].text
|
||||||
}
|
)
|
||||||
return rechunk
|
return rechunk
|
||||||
|
|
||||||
def openai_embeddings2ollama(chunk: dict):
|
def openai_embeddings2ollama(chunk: dict):
|
||||||
rechunk = {"embedding": chunk.data[0].embedding}
|
rechunk = ollama.EmbeddingsResponse(embedding=chunk.data[0].embedding)
|
||||||
return rechunk
|
return rechunk
|
||||||
|
|
||||||
def openai_embed2ollama(chunk: dict, model: str):
|
def openai_embed2ollama(chunk: dict, model: str):
|
||||||
rechunk = { "model": model,
|
rechunk = ollama.EmbedResponse(model=model,
|
||||||
"created_at": iso8601_ns(),
|
created_at=iso8601_ns(),
|
||||||
"done": None,
|
done=None,
|
||||||
"done_reason": None,
|
done_reason=None,
|
||||||
"total_duration": None,
|
total_duration=None,
|
||||||
"load_duration": None,
|
load_duration=None,
|
||||||
"prompt_eval_count": None,
|
prompt_eval_count=None,
|
||||||
"prompt_eval_duration": None,
|
prompt_eval_duration=None,
|
||||||
"eval_count": None,
|
eval_count=None,
|
||||||
"eval_duration": None,
|
eval_duration=None,
|
||||||
"embeddings": [chunk.data[0].embedding]
|
embeddings=[chunk.data[0].embedding]
|
||||||
}
|
)
|
||||||
return rechunk
|
return rechunk
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# SSE Helpser
|
# SSE Helpser
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
async def publish_snapshot():
|
async def publish_snapshot():
|
||||||
snapshot = json.dumps({"usage_counts": usage_counts})
|
async with usage_lock:
|
||||||
|
snapshot = json.dumps({"usage_counts": usage_counts}, sort_keys=True)
|
||||||
async with _subscribers_lock:
|
async with _subscribers_lock:
|
||||||
for q in _subscribers:
|
for q in _subscribers:
|
||||||
# If the queue is full, drop the message to avoid back‑pressure.
|
# If the queue is full, drop the message to avoid back‑pressure.
|
||||||
if q.full():
|
if q.full():
|
||||||
continue
|
try:
|
||||||
|
await q.get()
|
||||||
|
except asyncio.QueueEmpty:
|
||||||
|
pass
|
||||||
await q.put(snapshot)
|
await q.put(snapshot)
|
||||||
|
|
||||||
async def close_all_sse_queues():
|
async def close_all_sse_queues():
|
||||||
|
|
@ -497,7 +517,7 @@ async def proxy(request: Request):
|
||||||
}
|
}
|
||||||
|
|
||||||
params.update({k: v for k, v in optional_params.items() if v is not None})
|
params.update({k: v for k, v in optional_params.items() if v is not None})
|
||||||
oclient = openai.AsyncOpenAI(base_url=endpoint, api_key=config.api_keys[endpoint])
|
oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint])
|
||||||
else:
|
else:
|
||||||
client = ollama.AsyncClient(host=endpoint)
|
client = ollama.AsyncClient(host=endpoint)
|
||||||
await increment_usage(endpoint, model)
|
await increment_usage(endpoint, model)
|
||||||
|
|
@ -560,9 +580,10 @@ async def chat_proxy(request: Request):
|
||||||
tools = payload.get("tools")
|
tools = payload.get("tools")
|
||||||
stream = payload.get("stream")
|
stream = payload.get("stream")
|
||||||
think = payload.get("think")
|
think = payload.get("think")
|
||||||
format = payload.get("format")
|
_format = payload.get("format")
|
||||||
options = payload.get("options")
|
options = payload.get("options")
|
||||||
keep_alive = payload.get("keep_alive")
|
keep_alive = payload.get("keep_alive")
|
||||||
|
options = payload.get("options")
|
||||||
|
|
||||||
if not model:
|
if not model:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
|
|
@ -572,6 +593,10 @@ async def chat_proxy(request: Request):
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=400, detail="Missing or invalid 'messages' field (must be a list)"
|
status_code=400, detail="Missing or invalid 'messages' field (must be a list)"
|
||||||
)
|
)
|
||||||
|
if options is not None and not isinstance(options, dict):
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400, detail="`options` must be a JSON object"
|
||||||
|
)
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
|
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") from e
|
||||||
|
|
||||||
|
|
@ -591,9 +616,8 @@ async def chat_proxy(request: Request):
|
||||||
"tools": tools,
|
"tools": tools,
|
||||||
"stream": stream,
|
"stream": stream,
|
||||||
}
|
}
|
||||||
|
|
||||||
params.update({k: v for k, v in optional_params.items() if v is not None})
|
params.update({k: v for k, v in optional_params.items() if v is not None})
|
||||||
oclient = openai.AsyncOpenAI(base_url=endpoint, api_key=config.api_keys[endpoint])
|
oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint])
|
||||||
else:
|
else:
|
||||||
client = ollama.AsyncClient(host=endpoint)
|
client = ollama.AsyncClient(host=endpoint)
|
||||||
await increment_usage(endpoint, model)
|
await increment_usage(endpoint, model)
|
||||||
|
|
@ -605,7 +629,7 @@ async def chat_proxy(request: Request):
|
||||||
start_ts = time.perf_counter()
|
start_ts = time.perf_counter()
|
||||||
async_gen = await oclient.chat.completions.create(**params)
|
async_gen = await oclient.chat.completions.create(**params)
|
||||||
else:
|
else:
|
||||||
async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=format, options=options, keep_alive=keep_alive)
|
async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive)
|
||||||
if stream == True:
|
if stream == True:
|
||||||
async for chunk in async_gen:
|
async for chunk in async_gen:
|
||||||
if is_openai_endpoint:
|
if is_openai_endpoint:
|
||||||
|
|
@ -685,7 +709,7 @@ async def embedding_proxy(request: Request):
|
||||||
try:
|
try:
|
||||||
# The chat method returns a generator of dicts (or GenerateResponse)
|
# The chat method returns a generator of dicts (or GenerateResponse)
|
||||||
if is_openai_endpoint:
|
if is_openai_endpoint:
|
||||||
async_gen = await client.embeddings.create(input=[prompt], model=model)
|
async_gen = await client.embeddings.create(input=prompt, model=model)
|
||||||
async_gen = rechunk.openai_embeddings2ollama(async_gen)
|
async_gen = rechunk.openai_embeddings2ollama(async_gen)
|
||||||
else:
|
else:
|
||||||
async_gen = await client.embeddings(model=model, prompt=prompt, options=options, keep_alive=keep_alive)
|
async_gen = await client.embeddings(model=model, prompt=prompt, options=options, keep_alive=keep_alive)
|
||||||
|
|
@ -751,7 +775,7 @@ async def embed_proxy(request: Request):
|
||||||
try:
|
try:
|
||||||
# The chat method returns a generator of dicts (or GenerateResponse)
|
# The chat method returns a generator of dicts (or GenerateResponse)
|
||||||
if is_openai_endpoint:
|
if is_openai_endpoint:
|
||||||
async_gen = await client.embeddings.create(input=[_input], model=model)
|
async_gen = await client.embeddings.create(input=_input, model=model)
|
||||||
async_gen = rechunk.openai_embed2ollama(async_gen, model)
|
async_gen = rechunk.openai_embed2ollama(async_gen, model)
|
||||||
else:
|
else:
|
||||||
async_gen = await client.embed(model=model, input=_input, truncate=truncate, options=options, keep_alive=keep_alive)
|
async_gen = await client.embed(model=model, input=_input, truncate=truncate, options=options, keep_alive=keep_alive)
|
||||||
|
|
@ -1176,10 +1200,10 @@ async def openai_embedding_proxy(request: Request):
|
||||||
else:
|
else:
|
||||||
api_key = "ollama"
|
api_key = "ollama"
|
||||||
base_url = ep2base(endpoint)
|
base_url = ep2base(endpoint)
|
||||||
oclient = openai.AsyncOpenAI(base_url=base_url, api_key=api_key)
|
oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=api_key)
|
||||||
|
|
||||||
# 3. Async generator that streams embedding data and decrements the counter
|
# 3. Async generator that streams embedding data and decrements the counter
|
||||||
async_gen = await oclient.embeddings.create(input=[doc], model=model)
|
async_gen = await oclient.embeddings.create(input=doc, model=model)
|
||||||
|
|
||||||
await decrement_usage(endpoint, model)
|
await decrement_usage(endpoint, model)
|
||||||
|
|
||||||
|
|
@ -1256,7 +1280,7 @@ async def openai_chat_completions_proxy(request: Request):
|
||||||
endpoint = await choose_endpoint(model)
|
endpoint = await choose_endpoint(model)
|
||||||
await increment_usage(endpoint, model)
|
await increment_usage(endpoint, model)
|
||||||
base_url = ep2base(endpoint)
|
base_url = ep2base(endpoint)
|
||||||
oclient = openai.AsyncOpenAI(base_url=base_url, api_key=config.api_keys[endpoint])
|
oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys[endpoint])
|
||||||
|
|
||||||
# 3. Async generator that streams completions data and decrements the counter
|
# 3. Async generator that streams completions data and decrements the counter
|
||||||
async def stream_ochat_response():
|
async def stream_ochat_response():
|
||||||
|
|
@ -1359,7 +1383,7 @@ async def openai_completions_proxy(request: Request):
|
||||||
endpoint = await choose_endpoint(model)
|
endpoint = await choose_endpoint(model)
|
||||||
await increment_usage(endpoint, model)
|
await increment_usage(endpoint, model)
|
||||||
base_url = ep2base(endpoint)
|
base_url = ep2base(endpoint)
|
||||||
oclient = openai.AsyncOpenAI(base_url=base_url, api_key=config.api_keys[endpoint])
|
oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys[endpoint])
|
||||||
|
|
||||||
# 3. Async generator that streams completions data and decrements the counter
|
# 3. Async generator that streams completions data and decrements the counter
|
||||||
async def stream_ocompletions_response():
|
async def stream_ocompletions_response():
|
||||||
|
|
|
||||||
|
|
@ -346,7 +346,7 @@
|
||||||
const body = document.getElementById("tags-body");
|
const body = document.getElementById("tags-body");
|
||||||
body.innerHTML = data.models
|
body.innerHTML = data.models
|
||||||
.map((m) => {
|
.map((m) => {
|
||||||
let modelCell = `${m.id || m.name}`;
|
let modelCell = `${m.model}`;
|
||||||
if (m.digest) {
|
if (m.digest) {
|
||||||
modelCell += `<a href="#" class="delete-link" data-model="${m.name}">delete</a>`;
|
modelCell += `<a href="#" class="delete-link" data-model="${m.name}">delete</a>`;
|
||||||
modelCell += `<a href="#" class="copy-link" data-source="${m.name}">copy</a>`;
|
modelCell += `<a href="#" class="copy-link" data-source="${m.name}">copy</a>`;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue