fix(router): normalize multimodal input to extract text for embeddings

Extract text parts from multimodal payloads (lists/dicts).
Skip image_url and other non-text types to ensure embedding
models receive compatible text-only input.
This commit is contained in:
Alpha Nerd 2026-03-11 16:41:21 +01:00
parent 46da392a53
commit 1ae989788b

View file

@@ -2686,6 +2686,21 @@ async def openai_embedding_proxy(request: Request):
model = payload.get("model")
doc = payload.get("input")
# Normalize multimodal input: embedding models accept text only, so extract
# the text parts from chat-style content payloads and drop everything else
# (image_url parts in particular).
if isinstance(doc, list):
    normalized = []
    for item in doc:
        if isinstance(item, dict):
            # Multimodal content part — keep only explicit text parts;
            # a missing "text" key falls back to the empty string.
            if item.get("type") == "text":
                normalized.append(item.get("text", ""))
            # image_url and any other non-text part types are dropped.
        else:
            # Plain entry (e.g. a bare string) — pass through unchanged.
            normalized.append(item)
    if not normalized:
        # Every part was non-text (e.g. images only). Forwarding an empty
        # input would surface as an opaque backend error, so fail fast with
        # a clear client-side error instead.
        raise HTTPException(
            status_code=400,
            detail="Input contains no text parts suitable for embedding",
        )
    # Collapse a single-item list to a bare string so single-text requests
    # keep the simple string input shape.
    doc = normalized if len(normalized) != 1 else normalized[0]
elif isinstance(doc, dict) and doc.get("type") == "text":
    # A single multimodal text part passed directly (not wrapped in a list).
    doc = doc.get("text", "")
if not model:
raise HTTPException(
status_code=400, detail="Missing required field 'model'"