fix(gateway): accept raw utf-8 text in text-load (#729)

Co-authored-by: nanqinhu <139929317+nanqinhu@users.noreply.github.com>
This commit is contained in:
CommitHu502Craft 2026-03-30 23:58:58 +08:00 committed by Cyber MacGeddon
parent 5a9db2da50
commit 7af1d60db8
4 changed files with 91 additions and 9 deletions

View file

@ -4,6 +4,29 @@ from ...schema import Document, TextDocument, Chunk, DocumentEmbeddings, ChunkEm
from .base import SendTranslator
def _decode_text_payload(payload: str | bytes, charset: str) -> str:
"""
Decode text-load payloads.
Historical clients send base64-encoded text, but direct REST callers may
send raw UTF-8 text. Support both so Unicode text-load requests do not fail
at the gateway translation layer.
"""
if isinstance(payload, bytes):
if not payload.isascii():
return payload.decode(charset)
candidate = payload.decode("ascii")
else:
if not payload.isascii():
return payload
candidate = payload
try:
return base64.b64decode(candidate, validate=True).decode(charset)
except (ValueError, UnicodeDecodeError):
return candidate
class DocumentTranslator(SendTranslator):
"""Translator for Document schema objects (PDF docs etc.)"""
@ -49,8 +72,7 @@ class TextDocumentTranslator(SendTranslator):
def to_pulsar(self, data: Dict[str, Any]) -> TextDocument:
charset = data.get("charset", "utf-8")
# Text is base64 encoded in input
text = base64.b64decode(data["text"]).decode(charset)
text = _decode_text_payload(data["text"], charset)
from ...schema import Metadata
return TextDocument(
@ -169,4 +191,4 @@ class DocumentEmbeddingsTranslator(SendTranslator):
result["metadata"] = metadata_dict
return result
return result