fix(gateway): accept raw utf-8 text in text-load (#729)

Co-authored-by: nanqinhu <139929317+nanqinhu@users.noreply.github.com>
This commit is contained in:
CommitHu502Craft 2026-03-30 23:58:58 +08:00 committed by Cyber MacGeddon
parent 5a9db2da50
commit 7af1d60db8
4 changed files with 91 additions and 9 deletions

View file

@ -8,8 +8,7 @@ required:
properties:
text:
type: string
description: Text content (base64 encoded)
format: byte
description: Text content; either raw UTF-8 text, or base64-encoded text (accepted for compatibility with older clients)
example: VGhpcyBpcyB0aGUgZG9jdW1lbnQgdGV4dC4uLg==
id:
type: string

View file

@ -8,7 +8,7 @@ post:
## Text Load Overview
Fire-and-forget document loading:
- **Input**: Text content (base64 encoded)
- **Input**: Text content (raw UTF-8 or base64 encoded)
- **Process**: Chunk, embed, store
- **Output**: None (202 Accepted)
@ -26,7 +26,14 @@ post:
## Text Format
Text must be base64 encoded:
Text may be sent as raw UTF-8 text:
```
{
"text": "Cancer survival: 2.74× higher hazard ratio"
}
```
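Older clients may still send base64 encoded text (ASCII-only input that is valid base64 and decodes cleanly in the request charset is treated as base64):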
```
text_content = "This is the document..."
encoded = base64.b64encode(text_content.encode('utf-8'))
@ -78,12 +85,12 @@ post:
simpleLoad:
summary: Load text document
value:
text: VGhpcyBpcyB0aGUgZG9jdW1lbnQgdGV4dC4uLg==
text: This is the document text...
id: doc-123
user: alice
collection: research
withMetadata:
summary: Load with RDF metadata
summary: Load with RDF metadata using base64 text
value:
text: UXVhbnR1bSBjb21wdXRpbmcgdXNlcyBxdWFudHVtIG1lY2hhbmljcyBwcmluY2lwbGVzLi4u
id: doc-456

View file

@ -0,0 +1,54 @@
"""
Unit tests for text document gateway translation compatibility.
"""
import base64
from trustgraph.messaging.translators.document_loading import TextDocumentTranslator
class TestTextDocumentTranslator:
    """Compatibility checks for ``TextDocumentTranslator.to_pulsar``.

    Each test builds a text-load request, translates it, and asserts that the
    resulting message text equals the UTF-8 encoding of the intended payload.
    """

    # Sample containing a non-ASCII character (the multiplication sign).
    UNICODE_TEXT = "Cancer survival: 2.74× higher hazard ratio"

    @staticmethod
    def _translate(text, **fields):
        """Translate a request carrying *text*, a utf-8 charset and *fields*."""
        request = {**fields, "charset": "utf-8", "text": text}
        return TextDocumentTranslator().to_pulsar(request)

    def test_to_pulsar_decodes_base64_text(self):
        """A base64-encoded payload is decoded and metadata is carried over."""
        expected = self.UNICODE_TEXT.encode("utf-8")
        encoded = base64.b64encode(expected).decode("ascii")

        msg = self._translate(
            encoded,
            id="doc-1",
            user="alice",
            collection="research",
        )

        assert msg.metadata.id == "doc-1"
        assert msg.metadata.user == "alice"
        assert msg.metadata.collection == "research"
        assert msg.text == expected

    def test_to_pulsar_accepts_raw_utf8_text(self):
        """A raw payload containing non-ASCII characters is accepted as-is."""
        msg = self._translate(self.UNICODE_TEXT)

        assert msg.text == self.UNICODE_TEXT.encode("utf-8")

    def test_to_pulsar_falls_back_to_raw_non_base64_ascii(self):
        """ASCII text that is not valid base64 is kept as raw text."""
        raw = "plain-text payload"

        msg = self._translate(raw)

        assert msg.text == raw.encode("utf-8")

View file

@ -4,6 +4,29 @@ from ...schema import Document, TextDocument, Chunk, DocumentEmbeddings, ChunkEm
from .base import SendTranslator
def _decode_text_payload(payload: str | bytes, charset: str) -> str:
"""
Decode text-load payloads.
Historical clients send base64-encoded text, but direct REST callers may
send raw UTF-8 text. Support both so Unicode text-load requests do not fail
at the gateway translation layer.
"""
if isinstance(payload, bytes):
if not payload.isascii():
return payload.decode(charset)
candidate = payload.decode("ascii")
else:
if not payload.isascii():
return payload
candidate = payload
try:
return base64.b64decode(candidate, validate=True).decode(charset)
except (ValueError, UnicodeDecodeError):
return candidate
class DocumentTranslator(SendTranslator):
"""Translator for Document schema objects (PDF docs etc.)"""
@ -49,8 +72,7 @@ class TextDocumentTranslator(SendTranslator):
def to_pulsar(self, data: Dict[str, Any]) -> TextDocument:
charset = data.get("charset", "utf-8")
# Text is base64 encoded in input
text = base64.b64decode(data["text"]).decode(charset)
text = _decode_text_payload(data["text"], charset)
from ...schema import Metadata
return TextDocument(