mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
fix(gateway): accept raw utf-8 text in text-load (#729)
Co-authored-by: nanqinhu <139929317+nanqinhu@users.noreply.github.com>
This commit is contained in:
parent
5a9db2da50
commit
7af1d60db8
4 changed files with 91 additions and 9 deletions
|
|
@ -8,8 +8,7 @@ required:
|
||||||
properties:
|
properties:
|
||||||
text:
|
text:
|
||||||
type: string
|
type: string
|
||||||
description: Text content (base64 encoded)
|
description: Text content, either raw text or base64 encoded for compatibility with older clients
|
||||||
format: byte
|
|
||||||
example: VGhpcyBpcyB0aGUgZG9jdW1lbnQgdGV4dC4uLg==
|
example: VGhpcyBpcyB0aGUgZG9jdW1lbnQgdGV4dC4uLg==
|
||||||
id:
|
id:
|
||||||
type: string
|
type: string
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ post:
|
||||||
## Text Load Overview
|
## Text Load Overview
|
||||||
|
|
||||||
Fire-and-forget document loading:
|
Fire-and-forget document loading:
|
||||||
- **Input**: Text content (base64 encoded)
|
- **Input**: Text content (raw UTF-8 or base64 encoded)
|
||||||
- **Process**: Chunk, embed, store
|
- **Process**: Chunk, embed, store
|
||||||
- **Output**: None (202 Accepted)
|
- **Output**: None (202 Accepted)
|
||||||
|
|
||||||
|
|
@ -26,7 +26,14 @@ post:
|
||||||
|
|
||||||
## Text Format
|
## Text Format
|
||||||
|
|
||||||
Text must be base64 encoded:
|
Text may be sent as raw UTF-8 text:
|
||||||
|
```
|
||||||
|
{
|
||||||
|
"text": "Cancer survival: 2.74× higher hazard ratio"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Older clients may still send base64 encoded text:
|
||||||
```
|
```
|
||||||
text_content = "This is the document..."
|
text_content = "This is the document..."
|
||||||
encoded = base64.b64encode(text_content.encode('utf-8'))
|
encoded = base64.b64encode(text_content.encode('utf-8'))
|
||||||
|
|
@ -78,12 +85,12 @@ post:
|
||||||
simpleLoad:
|
simpleLoad:
|
||||||
summary: Load text document
|
summary: Load text document
|
||||||
value:
|
value:
|
||||||
text: VGhpcyBpcyB0aGUgZG9jdW1lbnQgdGV4dC4uLg==
|
text: This is the document text...
|
||||||
id: doc-123
|
id: doc-123
|
||||||
user: alice
|
user: alice
|
||||||
collection: research
|
collection: research
|
||||||
withMetadata:
|
withMetadata:
|
||||||
summary: Load with RDF metadata
|
summary: Load with RDF metadata using base64 text
|
||||||
value:
|
value:
|
||||||
text: UXVhbnR1bSBjb21wdXRpbmcgdXNlcyBxdWFudHVtIG1lY2hhbmljcyBwcmluY2lwbGVzLi4u
|
text: UXVhbnR1bSBjb21wdXRpbmcgdXNlcyBxdWFudHVtIG1lY2hhbmljcyBwcmluY2lwbGVzLi4u
|
||||||
id: doc-456
|
id: doc-456
|
||||||
|
|
|
||||||
54
tests/unit/test_gateway/test_text_document_translator.py
Normal file
54
tests/unit/test_gateway/test_text_document_translator.py
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
"""
|
||||||
|
Unit tests for text document gateway translation compatibility.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import base64
|
||||||
|
|
||||||
|
from trustgraph.messaging.translators.document_loading import TextDocumentTranslator
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextDocumentTranslator:
    """Checks how TextDocumentTranslator.to_pulsar interprets the text field."""

    def test_to_pulsar_decodes_base64_text(self):
        """Base64 text is decoded and the metadata fields are carried over."""
        original = "Cancer survival: 2.74× higher hazard ratio"
        expected_bytes = original.encode("utf-8")
        request = {
            "id": "doc-1",
            "user": "alice",
            "collection": "research",
            "charset": "utf-8",
            "text": base64.b64encode(expected_bytes).decode("ascii"),
        }

        document = TextDocumentTranslator().to_pulsar(request)

        assert document.metadata.id == "doc-1"
        assert document.metadata.user == "alice"
        assert document.metadata.collection == "research"
        assert document.text == expected_bytes

    def test_to_pulsar_accepts_raw_utf8_text(self):
        """Raw text containing a non-ASCII character is passed through."""
        original = "Cancer survival: 2.74× higher hazard ratio"
        request = {"charset": "utf-8", "text": original}

        document = TextDocumentTranslator().to_pulsar(request)

        assert document.text == original.encode("utf-8")

    def test_to_pulsar_falls_back_to_raw_non_base64_ascii(self):
        """ASCII text that is not valid base64 is kept as raw text."""
        original = "plain-text payload"
        request = {"charset": "utf-8", "text": original}

        document = TextDocumentTranslator().to_pulsar(request)

        assert document.text == original.encode("utf-8")
|
||||||
|
|
@ -4,6 +4,29 @@ from ...schema import Document, TextDocument, Chunk, DocumentEmbeddings, ChunkEm
|
||||||
from .base import SendTranslator
|
from .base import SendTranslator
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_text_payload(payload: str | bytes, charset: str) -> str:
|
||||||
|
"""
|
||||||
|
Decode text-load payloads.
|
||||||
|
|
||||||
|
Historical clients send base64-encoded text, but direct REST callers may
|
||||||
|
send raw UTF-8 text. Support both so Unicode text-load requests do not fail
|
||||||
|
at the gateway translation layer.
|
||||||
|
"""
|
||||||
|
if isinstance(payload, bytes):
|
||||||
|
if not payload.isascii():
|
||||||
|
return payload.decode(charset)
|
||||||
|
candidate = payload.decode("ascii")
|
||||||
|
else:
|
||||||
|
if not payload.isascii():
|
||||||
|
return payload
|
||||||
|
candidate = payload
|
||||||
|
|
||||||
|
try:
|
||||||
|
return base64.b64decode(candidate, validate=True).decode(charset)
|
||||||
|
except (ValueError, UnicodeDecodeError):
|
||||||
|
return candidate
|
||||||
|
|
||||||
|
|
||||||
class DocumentTranslator(SendTranslator):
|
class DocumentTranslator(SendTranslator):
|
||||||
"""Translator for Document schema objects (PDF docs etc.)"""
|
"""Translator for Document schema objects (PDF docs etc.)"""
|
||||||
|
|
||||||
|
|
@ -49,8 +72,7 @@ class TextDocumentTranslator(SendTranslator):
|
||||||
def to_pulsar(self, data: Dict[str, Any]) -> TextDocument:
|
def to_pulsar(self, data: Dict[str, Any]) -> TextDocument:
|
||||||
charset = data.get("charset", "utf-8")
|
charset = data.get("charset", "utf-8")
|
||||||
|
|
||||||
# Text is base64 encoded in input
|
text = _decode_text_payload(data["text"], charset)
|
||||||
text = base64.b64decode(data["text"]).decode(charset)
|
|
||||||
|
|
||||||
from ...schema import Metadata
|
from ...schema import Metadata
|
||||||
return TextDocument(
|
return TextDocument(
|
||||||
|
|
@ -169,4 +191,4 @@ class DocumentEmbeddingsTranslator(SendTranslator):
|
||||||
|
|
||||||
result["metadata"] = metadata_dict
|
result["metadata"] = metadata_dict
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue