fix(gateway): accept raw utf-8 text in text-load (#729)

Co-authored-by: nanqinhu <139929317+nanqinhu@users.noreply.github.com>
This commit is contained in:
CommitHu502Craft 2026-03-30 23:58:58 +08:00 committed by Cyber MacGeddon
parent 5a9db2da50
commit 7af1d60db8
4 changed files with 91 additions and 9 deletions

View file

@ -0,0 +1,54 @@
"""
Unit tests for text document gateway translation compatibility.
"""
import base64
from trustgraph.messaging.translators.document_loading import TextDocumentTranslator
class TestTextDocumentTranslator:
    """Exercise ``TextDocumentTranslator.to_pulsar`` with differently encoded ``text`` values."""

    def test_to_pulsar_decodes_base64_text(self):
        """Base64-encoded ``text`` yields the decoded UTF-8 bytes and keeps the metadata."""
        payload = "Cancer survival: 2.74× higher hazard ratio"
        expected = payload.encode("utf-8")
        request = {
            "id": "doc-1",
            "user": "alice",
            "collection": "research",
            "charset": "utf-8",
            "text": base64.b64encode(expected).decode("ascii"),
        }

        msg = TextDocumentTranslator().to_pulsar(request)

        metadata = msg.metadata
        assert metadata.id == "doc-1"
        assert metadata.user == "alice"
        assert metadata.collection == "research"
        assert msg.text == expected

    def test_to_pulsar_accepts_raw_utf8_text(self):
        """Raw text containing a non-ASCII character is emitted as its UTF-8 bytes."""
        payload = "Cancer survival: 2.74× higher hazard ratio"
        request = {
            "charset": "utf-8",
            "text": payload,
        }

        msg = TextDocumentTranslator().to_pulsar(request)

        assert msg.text == payload.encode("utf-8")

    def test_to_pulsar_falls_back_to_raw_non_base64_ascii(self):
        """ASCII text with a space and a hyphen is emitted unchanged as UTF-8 bytes."""
        payload = "plain-text payload"
        request = {
            "charset": "utf-8",
            "text": payload,
        }

        msg = TextDocumentTranslator().to_pulsar(request)

        assert msg.text == payload.encode("utf-8")