diff --git a/specs/api/components/schemas/loading/TextLoadRequest.yaml b/specs/api/components/schemas/loading/TextLoadRequest.yaml index 4ded87d5..447308d4 100644 --- a/specs/api/components/schemas/loading/TextLoadRequest.yaml +++ b/specs/api/components/schemas/loading/TextLoadRequest.yaml @@ -8,8 +8,7 @@ required: properties: text: type: string - description: Text content (base64 encoded) - format: byte + description: Text content, either raw text or base64 encoded for compatibility with older clients example: VGhpcyBpcyB0aGUgZG9jdW1lbnQgdGV4dC4uLg== id: type: string diff --git a/specs/api/paths/flow/text-load.yaml b/specs/api/paths/flow/text-load.yaml index 5f918a3a..08bfe47b 100644 --- a/specs/api/paths/flow/text-load.yaml +++ b/specs/api/paths/flow/text-load.yaml @@ -8,7 +8,7 @@ post: ## Text Load Overview Fire-and-forget document loading: - - **Input**: Text content (base64 encoded) + - **Input**: Text content (raw UTF-8 or base64 encoded) - **Process**: Chunk, embed, store - **Output**: None (202 Accepted) @@ -26,7 +26,14 @@ post: ## Text Format - Text must be base64 encoded: + Text may be sent as raw UTF-8 text: + ``` + { + "text": "Cancer survival: 2.74× higher hazard ratio" + } + ``` + + Older clients may still send base64 encoded text: ``` text_content = "This is the document..." encoded = base64.b64encode(text_content.encode('utf-8')) @@ -78,12 +85,12 @@ post: simpleLoad: summary: Load text document value: - text: VGhpcyBpcyB0aGUgZG9jdW1lbnQgdGV4dC4uLg== + text: This is the document text... 
"""
Unit tests for text document gateway translation compatibility.
"""

import base64

from trustgraph.messaging.translators.document_loading import TextDocumentTranslator


class TestTextDocumentTranslator:
    """Exercise TextDocumentTranslator.to_pulsar with base64 and raw text input."""

    def test_to_pulsar_decodes_base64_text(self):
        # Legacy request shape: the text field holds base64 of the UTF-8 bytes.
        expected_bytes = "Cancer survival: 2.74× higher hazard ratio".encode("utf-8")
        request = {
            "id": "doc-1",
            "user": "alice",
            "collection": "research",
            "charset": "utf-8",
            "text": base64.b64encode(expected_bytes).decode("ascii"),
        }

        result = TextDocumentTranslator().to_pulsar(request)

        assert result.metadata.id == "doc-1"
        assert result.metadata.user == "alice"
        assert result.metadata.collection == "research"
        assert result.text == expected_bytes

    def test_to_pulsar_accepts_raw_utf8_text(self):
        # Non-ASCII text cannot be base64, so it must pass through unchanged.
        raw_text = "Cancer survival: 2.74× higher hazard ratio"
        request = {"charset": "utf-8", "text": raw_text}

        result = TextDocumentTranslator().to_pulsar(request)

        assert result.text == raw_text.encode("utf-8")

    def test_to_pulsar_falls_back_to_raw_non_base64_ascii(self):
        # ASCII text that is not valid base64 is kept as raw text.
        raw_text = "plain-text payload"
        request = {"charset": "utf-8", "text": raw_text}

        result = TextDocumentTranslator().to_pulsar(request)

        assert result.text == raw_text.encode("utf-8")
a/trustgraph-base/trustgraph/messaging/translators/document_loading.py +++ b/trustgraph-base/trustgraph/messaging/translators/document_loading.py @@ -4,6 +4,29 @@ from ...schema import Document, TextDocument, Chunk, DocumentEmbeddings, ChunkEm from .base import SendTranslator +def _decode_text_payload(payload: str | bytes, charset: str) -> str: + """ + Decode text-load payloads. + + Historical clients send base64-encoded text, but direct REST callers may + send raw UTF-8 text. Support both so Unicode text-load requests do not fail + at the gateway translation layer. + """ + if isinstance(payload, bytes): + if not payload.isascii(): + return payload.decode(charset) + candidate = payload.decode("ascii") + else: + if not payload.isascii(): + return payload + candidate = payload + + try: + return base64.b64decode(candidate, validate=True).decode(charset) + except (ValueError, UnicodeDecodeError): + return candidate + + class DocumentTranslator(SendTranslator): """Translator for Document schema objects (PDF docs etc.)""" @@ -49,8 +72,7 @@ class TextDocumentTranslator(SendTranslator): def to_pulsar(self, data: Dict[str, Any]) -> TextDocument: charset = data.get("charset", "utf-8") - # Text is base64 encoded in input - text = base64.b64decode(data["text"]).decode(charset) + text = _decode_text_payload(data["text"], charset) from ...schema import Metadata return TextDocument( @@ -169,4 +191,4 @@ class DocumentEmbeddingsTranslator(SendTranslator): result["metadata"] = metadata_dict - return result \ No newline at end of file + return result