mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
fix(gateway): accept raw utf-8 text in text-load (#729)
Co-authored-by: nanqinhu <139929317+nanqinhu@users.noreply.github.com>
This commit is contained in:
parent
5a9db2da50
commit
7af1d60db8
4 changed files with 91 additions and 9 deletions
|
|
@ -8,8 +8,7 @@ required:
|
|||
properties:
|
||||
text:
|
||||
type: string
|
||||
description: Text content (base64 encoded)
|
||||
format: byte
|
||||
description: Text content, either raw UTF-8 text or base64-encoded text (base64 is still accepted for compatibility with older clients)
|
||||
example: VGhpcyBpcyB0aGUgZG9jdW1lbnQgdGV4dC4uLg==
|
||||
id:
|
||||
type: string
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ post:
|
|||
## Text Load Overview
|
||||
|
||||
Fire-and-forget document loading:
|
||||
- **Input**: Text content (base64 encoded)
|
||||
- **Input**: Text content (raw UTF-8 or base64 encoded)
|
||||
- **Process**: Chunk, embed, store
|
||||
- **Output**: None (202 Accepted)
|
||||
|
||||
|
|
@ -26,7 +26,14 @@ post:
|
|||
|
||||
## Text Format
|
||||
|
||||
Text must be base64 encoded:
|
||||
Text may be sent as raw UTF-8 text:
|
||||
```
|
||||
{
|
||||
"text": "Cancer survival: 2.74× higher hazard ratio"
|
||||
}
|
||||
```
|
||||
|
||||
Older clients may still send base64 encoded text:
|
||||
```
|
||||
text_content = "This is the document..."
|
||||
encoded = base64.b64encode(text_content.encode('utf-8'))
|
||||
|
|
@ -78,12 +85,12 @@ post:
|
|||
simpleLoad:
|
||||
summary: Load text document
|
||||
value:
|
||||
text: VGhpcyBpcyB0aGUgZG9jdW1lbnQgdGV4dC4uLg==
|
||||
text: This is the document text...
|
||||
id: doc-123
|
||||
user: alice
|
||||
collection: research
|
||||
withMetadata:
|
||||
summary: Load with RDF metadata
|
||||
summary: Load with RDF metadata using base64 text
|
||||
value:
|
||||
text: UXVhbnR1bSBjb21wdXRpbmcgdXNlcyBxdWFudHVtIG1lY2hhbmljcyBwcmluY2lwbGVzLi4u
|
||||
id: doc-456
|
||||
|
|
|
|||
54
tests/unit/test_gateway/test_text_document_translator.py
Normal file
54
tests/unit/test_gateway/test_text_document_translator.py
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
"""
|
||||
Unit tests for text document gateway translation compatibility.
|
||||
"""
|
||||
|
||||
import base64
|
||||
|
||||
from trustgraph.messaging.translators.document_loading import TextDocumentTranslator
|
||||
|
||||
|
||||
class TestTextDocumentTranslator:
    """Exercise TextDocumentTranslator.to_pulsar with base64 and raw text."""

    def test_to_pulsar_decodes_base64_text(self):
        """A base64 "text" field yields the original UTF-8 bytes in msg.text."""
        payload = "Cancer survival: 2.74× higher hazard ratio"
        expected_bytes = payload.encode("utf-8")
        request = {
            "id": "doc-1",
            "user": "alice",
            "collection": "research",
            "charset": "utf-8",
            "text": base64.b64encode(expected_bytes).decode("ascii"),
        }

        msg = TextDocumentTranslator().to_pulsar(request)

        # The metadata fields are copied through from the request.
        assert msg.metadata.id == "doc-1"
        assert msg.metadata.user == "alice"
        assert msg.metadata.collection == "research"
        assert msg.text == expected_bytes

    def test_to_pulsar_accepts_raw_utf8_text(self):
        """Raw text containing a non-ASCII character is passed through."""
        payload = "Cancer survival: 2.74× higher hazard ratio"
        request = {"charset": "utf-8", "text": payload}

        msg = TextDocumentTranslator().to_pulsar(request)

        assert msg.text == payload.encode("utf-8")

    def test_to_pulsar_falls_back_to_raw_non_base64_ascii(self):
        """ASCII text that is not valid base64 is kept as it was sent."""
        payload = "plain-text payload"
        request = {"charset": "utf-8", "text": payload}

        msg = TextDocumentTranslator().to_pulsar(request)

        assert msg.text == payload.encode("utf-8")
|
||||
|
|
@ -4,6 +4,29 @@ from ...schema import Document, TextDocument, Chunk, DocumentEmbeddings, ChunkEm
|
|||
from .base import SendTranslator
|
||||
|
||||
|
||||
def _decode_text_payload(payload: str | bytes, charset: str) -> str:
|
||||
"""
|
||||
Decode text-load payloads.
|
||||
|
||||
Historical clients send base64-encoded text, but direct REST callers may
|
||||
send raw UTF-8 text. Support both so Unicode text-load requests do not fail
|
||||
at the gateway translation layer.
|
||||
"""
|
||||
if isinstance(payload, bytes):
|
||||
if not payload.isascii():
|
||||
return payload.decode(charset)
|
||||
candidate = payload.decode("ascii")
|
||||
else:
|
||||
if not payload.isascii():
|
||||
return payload
|
||||
candidate = payload
|
||||
|
||||
try:
|
||||
return base64.b64decode(candidate, validate=True).decode(charset)
|
||||
except (ValueError, UnicodeDecodeError):
|
||||
return candidate
|
||||
|
||||
|
||||
class DocumentTranslator(SendTranslator):
|
||||
"""Translator for Document schema objects (PDF docs etc.)"""
|
||||
|
||||
|
|
@ -49,8 +72,7 @@ class TextDocumentTranslator(SendTranslator):
|
|||
def to_pulsar(self, data: Dict[str, Any]) -> TextDocument:
|
||||
charset = data.get("charset", "utf-8")
|
||||
|
||||
# Text is base64 encoded in input
|
||||
text = base64.b64decode(data["text"]).decode(charset)
|
||||
text = _decode_text_payload(data["text"], charset)
|
||||
|
||||
from ...schema import Metadata
|
||||
return TextDocument(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue