fix(/v1/embeddings): returning the async_gen forced FastAPI serialization which caused Pydantic errors. Also sanitized nan/inf values to floats (0.0).

Use try/finally to properly decrement usage counters in case of error.
This commit is contained in:
Alpha Nerd 2026-02-27 16:39:27 +01:00
parent d2ea65f74a
commit ad4a1d07b2

View file

@ -2,11 +2,11 @@
title: NOMYO Router - an Ollama Proxy with Endpoint:Model aware routing title: NOMYO Router - an Ollama Proxy with Endpoint:Model aware routing
author: alpha-nerd-nomyo author: alpha-nerd-nomyo
author_url: https://github.com/nomyo-ai author_url: https://github.com/nomyo-ai
version: 0.6 version: 0.7
license: AGPL license: AGPL
""" """
# ------------------------------------------------------------- # -------------------------------------------------------------
import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets import orjson, time, asyncio, yaml, ollama, openai, os, re, aiohttp, ssl, random, base64, io, enhance, secrets, math
try: try:
import truststore; truststore.inject_into_ssl() import truststore; truststore.inject_into_ssl()
except ImportError: except ImportError:
@ -2637,14 +2637,17 @@ async def openai_embedding_proxy(request: Request):
oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=api_key) oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=api_key)
# 3. Async generator that streams embedding data and decrements the counter try:
async_gen = await oclient.embeddings.create(input=doc, model=model) async_gen = await oclient.embeddings.create(input=doc, model=model)
result = async_gen.model_dump()
for item in result.get("data", []):
emb = item.get("embedding")
if emb:
item["embedding"] = [0.0 if isinstance(v, float) and not math.isfinite(v) else v for v in emb]
return JSONResponse(content=result)
finally:
await decrement_usage(endpoint, tracking_model) await decrement_usage(endpoint, tracking_model)
# 5. Return a StreamingResponse backed by the generator
return async_gen
# ------------------------------------------------------------- # -------------------------------------------------------------
# 22. API route OpenAI compatible Chat Completions # 22. API route OpenAI compatible Chat Completions
# ------------------------------------------------------------- # -------------------------------------------------------------