mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-06-16 02:23:39 +02:00
feat: complete knowledge core storage — named graphs, provenance, source material (#973)
Implements all three changes from the knowledge-core-completeness tech spec: 1. The named graph field is now preserved through Cassandra storage (stored as a 7-element tuple), so provenance triples retain their graph URIs on round-trip. 2. Provenance triples already arrive on triples-input, so no routing change was needed; change 1 was sufficient. 3. Source material (library documents) is now streamed alongside triples and embeddings during core download/upload. The knowledge manager fetches the document hierarchy from the librarian on download and recreates it on upload, preserving the full provenance chain across instances.
This commit is contained in:
parent
aa158e1ba3
commit
6df7471a55
14 changed files with 1347 additions and 15 deletions
|
|
@ -11,7 +11,12 @@ from unittest.mock import AsyncMock, Mock, patch, MagicMock
|
|||
from unittest.mock import call
|
||||
|
||||
from trustgraph.cores.knowledge import KnowledgeManager
|
||||
from trustgraph.schema import KnowledgeResponse, Triples, GraphEmbeddings, Metadata, Triple, Term, EntityEmbeddings, IRI, LITERAL
|
||||
from trustgraph.schema import (
|
||||
KnowledgeResponse, Triples, GraphEmbeddings, Metadata, Triple, Term,
|
||||
EntityEmbeddings, IRI, LITERAL,
|
||||
LibraryMetadata, LibraryBlob,
|
||||
LibrarianResponse, DocumentMetadata,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
@ -373,11 +378,252 @@ class TestKnowledgeManagerOtherMethods:
|
|||
mock_respond = AsyncMock()
|
||||
|
||||
await knowledge_manager.delete_kg_core(mock_request, mock_respond, "test-user")
|
||||
|
||||
|
||||
# Verify table store was called correctly
|
||||
knowledge_manager.table_store.delete_kg_core.assert_called_once_with("test-user", "test-doc-id")
|
||||
|
||||
|
||||
# Verify response
|
||||
mock_respond.assert_called_once()
|
||||
response = mock_respond.call_args[0][0]
|
||||
assert response.error is None
|
||||
assert response.error is None
|
||||
|
||||
|
||||
class TestKnowledgeManagerLibraryDownload:
|
||||
"""Test get_kg_core streaming of library documents."""
|
||||
|
||||
@pytest.fixture
|
||||
def manager_with_librarian(self, mock_flow_config):
|
||||
with patch('trustgraph.cores.knowledge.KnowledgeTableStore'):
|
||||
mock_librarian = AsyncMock()
|
||||
manager = KnowledgeManager(
|
||||
cassandra_host=["localhost"],
|
||||
cassandra_username="test_user",
|
||||
cassandra_password="test_pass",
|
||||
keyspace="test_keyspace",
|
||||
flow_config=mock_flow_config,
|
||||
librarian=mock_librarian,
|
||||
)
|
||||
manager.table_store = AsyncMock()
|
||||
return manager
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_kg_core_streams_library_docs(self, manager_with_librarian):
|
||||
mock_request = Mock()
|
||||
mock_request.id = "root-doc"
|
||||
mock_respond = AsyncMock()
|
||||
|
||||
manager_with_librarian.table_store.get_triples = AsyncMock()
|
||||
manager_with_librarian.table_store.get_graph_embeddings = AsyncMock()
|
||||
|
||||
root_meta = DocumentMetadata(
|
||||
id="root-doc", kind="application/pdf", title="Test PDF",
|
||||
document_type="source",
|
||||
)
|
||||
child_meta = DocumentMetadata(
|
||||
id="chunk-1", kind="text/plain", title="Chunk 1",
|
||||
parent_id="root-doc", document_type="chunk",
|
||||
)
|
||||
|
||||
manager_with_librarian.librarian.fetch_document_metadata.return_value = root_meta
|
||||
manager_with_librarian.librarian.request.return_value = LibrarianResponse(
|
||||
document_metadatas=[child_meta],
|
||||
)
|
||||
manager_with_librarian.librarian.fetch_document_content.side_effect = [
|
||||
b"cm9vdCBjb250ZW50",
|
||||
b"Y2h1bmsgY29udGVudA==",
|
||||
]
|
||||
|
||||
await manager_with_librarian.get_kg_core(
|
||||
mock_request, mock_respond, "test-user"
|
||||
)
|
||||
|
||||
responses = [c[0][0] for c in mock_respond.call_args_list]
|
||||
|
||||
lm_responses = [r for r in responses if r.library_metadata is not None]
|
||||
lb_responses = [r for r in responses if r.library_blob is not None]
|
||||
eos_responses = [r for r in responses if r.eos is True]
|
||||
|
||||
assert len(lm_responses) == 2
|
||||
assert lm_responses[0].library_metadata.id == "root-doc"
|
||||
assert lm_responses[0].library_metadata.document_type == "source"
|
||||
assert lm_responses[1].library_metadata.id == "chunk-1"
|
||||
assert lm_responses[1].library_metadata.parent_id == "root-doc"
|
||||
|
||||
assert len(lb_responses) == 2
|
||||
assert lb_responses[0].library_blob.id == "root-doc"
|
||||
assert lb_responses[0].library_blob.data == b"cm9vdCBjb250ZW50"
|
||||
assert lb_responses[1].library_blob.id == "chunk-1"
|
||||
|
||||
assert len(eos_responses) == 1
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_kg_core_no_librarian_skips_library(self, mock_flow_config):
|
||||
with patch('trustgraph.cores.knowledge.KnowledgeTableStore'):
|
||||
manager = KnowledgeManager(
|
||||
cassandra_host=["localhost"],
|
||||
cassandra_username="u", cassandra_password="p",
|
||||
keyspace="ks", flow_config=mock_flow_config,
|
||||
)
|
||||
manager.table_store = AsyncMock()
|
||||
manager.table_store.get_triples = AsyncMock()
|
||||
manager.table_store.get_graph_embeddings = AsyncMock()
|
||||
|
||||
mock_request = Mock()
|
||||
mock_request.id = "doc-1"
|
||||
mock_respond = AsyncMock()
|
||||
|
||||
await manager.get_kg_core(mock_request, mock_respond, "w")
|
||||
|
||||
responses = [c[0][0] for c in mock_respond.call_args_list]
|
||||
assert all(r.library_metadata is None for r in responses)
|
||||
assert all(r.library_blob is None for r in responses)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_kg_core_librarian_metadata_failure_is_graceful(
|
||||
self, manager_with_librarian,
|
||||
):
|
||||
mock_request = Mock()
|
||||
mock_request.id = "missing-doc"
|
||||
mock_respond = AsyncMock()
|
||||
|
||||
manager_with_librarian.table_store.get_triples = AsyncMock()
|
||||
manager_with_librarian.table_store.get_graph_embeddings = AsyncMock()
|
||||
manager_with_librarian.librarian.fetch_document_metadata.side_effect = (
|
||||
RuntimeError("not found")
|
||||
)
|
||||
|
||||
await manager_with_librarian.get_kg_core(
|
||||
mock_request, mock_respond, "test-user"
|
||||
)
|
||||
|
||||
responses = [c[0][0] for c in mock_respond.call_args_list]
|
||||
assert all(r.library_metadata is None for r in responses)
|
||||
assert any(r.eos for r in responses)
|
||||
|
||||
|
||||
class TestKnowledgeManagerLibraryUpload:
|
||||
"""Test put_kg_core handling of library metadata and blob records."""
|
||||
|
||||
@pytest.fixture
|
||||
def manager_with_librarian(self, mock_flow_config):
|
||||
with patch('trustgraph.cores.knowledge.KnowledgeTableStore'):
|
||||
mock_librarian = AsyncMock()
|
||||
manager = KnowledgeManager(
|
||||
cassandra_host=["localhost"],
|
||||
cassandra_username="u", cassandra_password="p",
|
||||
keyspace="ks", flow_config=mock_flow_config,
|
||||
librarian=mock_librarian,
|
||||
)
|
||||
manager.table_store = AsyncMock()
|
||||
return manager
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_put_metadata_then_blob_calls_librarian(
|
||||
self, manager_with_librarian,
|
||||
):
|
||||
mock_respond = AsyncMock()
|
||||
manager_with_librarian.librarian.request.return_value = LibrarianResponse()
|
||||
|
||||
# First call: metadata
|
||||
req_meta = Mock()
|
||||
req_meta.triples = None
|
||||
req_meta.graph_embeddings = None
|
||||
req_meta.library_metadata = LibraryMetadata(
|
||||
id="doc-1", kind="application/pdf", title="Test",
|
||||
document_type="source",
|
||||
)
|
||||
req_meta.library_blob = None
|
||||
await manager_with_librarian.put_kg_core(req_meta, mock_respond, "ws")
|
||||
|
||||
# Metadata is buffered, librarian not called yet
|
||||
manager_with_librarian.librarian.request.assert_not_called()
|
||||
|
||||
# Second call: blob
|
||||
req_blob = Mock()
|
||||
req_blob.triples = None
|
||||
req_blob.graph_embeddings = None
|
||||
req_blob.library_metadata = None
|
||||
req_blob.library_blob = LibraryBlob(
|
||||
id="doc-1", data=b"dGVzdA==",
|
||||
)
|
||||
await manager_with_librarian.put_kg_core(req_blob, mock_respond, "ws")
|
||||
|
||||
# Now librarian should have been called with add-document
|
||||
manager_with_librarian.librarian.request.assert_called_once()
|
||||
call_args = manager_with_librarian.librarian.request.call_args[0][0]
|
||||
assert call_args.operation == "add-document"
|
||||
assert call_args.document_metadata.id == "doc-1"
|
||||
assert call_args.document_metadata.kind == "application/pdf"
|
||||
assert call_args.content == b"dGVzdA=="
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_put_child_document_uses_add_child_operation(
|
||||
self, manager_with_librarian,
|
||||
):
|
||||
mock_respond = AsyncMock()
|
||||
manager_with_librarian.librarian.request.return_value = LibrarianResponse()
|
||||
|
||||
req_meta = Mock()
|
||||
req_meta.triples = None
|
||||
req_meta.graph_embeddings = None
|
||||
req_meta.library_metadata = LibraryMetadata(
|
||||
id="chunk-1", kind="text/plain", title="Chunk",
|
||||
parent_id="doc-1", document_type="chunk",
|
||||
)
|
||||
req_meta.library_blob = None
|
||||
await manager_with_librarian.put_kg_core(req_meta, mock_respond, "ws")
|
||||
|
||||
req_blob = Mock()
|
||||
req_blob.triples = None
|
||||
req_blob.graph_embeddings = None
|
||||
req_blob.library_metadata = None
|
||||
req_blob.library_blob = LibraryBlob(id="chunk-1", data=b"Y2h1bms=")
|
||||
await manager_with_librarian.put_kg_core(req_blob, mock_respond, "ws")
|
||||
|
||||
call_args = manager_with_librarian.librarian.request.call_args[0][0]
|
||||
assert call_args.operation == "add-child-document"
|
||||
assert call_args.document_metadata.parent_id == "doc-1"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_put_blob_without_metadata_logs_warning(
|
||||
self, manager_with_librarian,
|
||||
):
|
||||
mock_respond = AsyncMock()
|
||||
|
||||
req_blob = Mock()
|
||||
req_blob.triples = None
|
||||
req_blob.graph_embeddings = None
|
||||
req_blob.library_metadata = None
|
||||
req_blob.library_blob = LibraryBlob(id="orphan", data=b"data")
|
||||
await manager_with_librarian.put_kg_core(req_blob, mock_respond, "ws")
|
||||
|
||||
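# An orphan blob (no preceding metadata record) must not reach the librarian.
# NOTE(review): despite the test name, the warning log itself is not asserted here.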
|
||||
manager_with_librarian.librarian.request.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_put_existing_document_is_graceful(
|
||||
self, manager_with_librarian,
|
||||
):
|
||||
mock_respond = AsyncMock()
|
||||
manager_with_librarian.librarian.request.side_effect = RuntimeError(
|
||||
"Document already exists"
|
||||
)
|
||||
|
||||
req_meta = Mock()
|
||||
req_meta.triples = None
|
||||
req_meta.graph_embeddings = None
|
||||
req_meta.library_metadata = LibraryMetadata(
|
||||
id="doc-1", kind="application/pdf", title="Test",
|
||||
document_type="source",
|
||||
)
|
||||
req_meta.library_blob = None
|
||||
await manager_with_librarian.put_kg_core(req_meta, mock_respond, "ws")
|
||||
|
||||
req_blob = Mock()
|
||||
req_blob.triples = None
|
||||
req_blob.graph_embeddings = None
|
||||
req_blob.library_metadata = None
|
||||
req_blob.library_blob = LibraryBlob(id="doc-1", data=b"data")
|
||||
await manager_with_librarian.put_kg_core(req_blob, mock_respond, "ws")
|
||||
|
||||
# No explicit assertion: the test passes as long as put_kg_core does not
# propagate the librarian's "Document already exists" RuntimeError.
|
||||
|
|
@ -155,7 +155,7 @@ class TestGetTriples:
|
|||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.tables.knowledge.async_execute_paged', new_callable=AsyncMock)
|
||||
async def test_row_converts_to_triples(self, mock_async_execute_paged):
|
||||
# row[3] is a list of (s_val, s_uri, p_val, p_uri, o_val, o_uri)
|
||||
# row[3] is a list of (s_val, s_uri, p_val, p_uri, o_val, o_uri, graph)
|
||||
fake_row = (
|
||||
None, None, None,
|
||||
[
|
||||
|
|
@ -163,6 +163,7 @@ class TestGetTriples:
|
|||
"http://example.org/alice", True,
|
||||
"http://example.org/knows", True,
|
||||
"http://example.org/bob", True,
|
||||
"urn:graph:source",
|
||||
),
|
||||
],
|
||||
)
|
||||
|
|
@ -191,3 +192,33 @@ class TestGetTriples:
|
|||
assert t.s.iri == "http://example.org/alice"
|
||||
assert t.p.iri == "http://example.org/knows"
|
||||
assert t.o.iri == "http://example.org/bob"
|
||||
assert t.g == "urn:graph:source"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.tables.knowledge.async_execute_paged', new_callable=AsyncMock)
|
||||
async def test_empty_graph_name_becomes_none(self, mock_async_execute_paged):
|
||||
fake_row = (
|
||||
None, None, None,
|
||||
[
|
||||
(
|
||||
"http://example.org/alice", True,
|
||||
"http://example.org/knows", True,
|
||||
"http://example.org/bob", True,
|
||||
"",
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
store = _make_store()
|
||||
store.cassandra = Mock()
|
||||
store.get_triples_stmt = Mock()
|
||||
mock_async_execute_paged.return_value = [[fake_row]]
|
||||
|
||||
received = []
|
||||
|
||||
async def receiver(msg):
|
||||
received.append(msg)
|
||||
|
||||
await store.get_triples("w", "d", receiver)
|
||||
|
||||
assert received[0].triples[0].g is None
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
"""
|
||||
Round-trip unit tests for KnowledgeRequestTranslator.
|
||||
Round-trip unit tests for KnowledgeRequestTranslator and
|
||||
KnowledgeResponseTranslator.
|
||||
|
||||
Regression coverage: a previous version of the decode side constructed
|
||||
EntityEmbeddings(vectors=...) — the schema field is `vector` (singular),
|
||||
|
|
@ -15,9 +16,13 @@ Triples breaks the test.
|
|||
|
||||
import pytest
|
||||
|
||||
from trustgraph.messaging.translators.knowledge import KnowledgeRequestTranslator
|
||||
from trustgraph.messaging.translators.knowledge import (
|
||||
KnowledgeRequestTranslator,
|
||||
KnowledgeResponseTranslator,
|
||||
)
|
||||
from trustgraph.schema import (
|
||||
KnowledgeRequest,
|
||||
KnowledgeResponse,
|
||||
GraphEmbeddings,
|
||||
EntityEmbeddings,
|
||||
Triples,
|
||||
|
|
@ -25,6 +30,8 @@ from trustgraph.schema import (
|
|||
Metadata,
|
||||
Term,
|
||||
IRI,
|
||||
LibraryMetadata,
|
||||
LibraryBlob,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -145,3 +152,161 @@ class TestKnowledgeRequestTranslatorTriples:
|
|||
assert t.s.iri == "http://example.org/alice"
|
||||
assert t.p.iri == "http://example.org/knows"
|
||||
assert t.o.iri == "http://example.org/bob"
|
||||
|
||||
|
||||
class TestKnowledgeRequestTranslatorLibrary:
|
||||
|
||||
def test_roundtrip_preserves_library_metadata(self, translator):
|
||||
request = KnowledgeRequest(
|
||||
operation="put-kg-core",
|
||||
id="doc-1",
|
||||
library_metadata=LibraryMetadata(
|
||||
id="doc-1",
|
||||
kind="application/pdf",
|
||||
title="Test Document",
|
||||
parent_id="",
|
||||
document_type="source",
|
||||
comments="test comments",
|
||||
tags=["tag1", "tag2"],
|
||||
),
|
||||
)
|
||||
|
||||
encoded = translator.encode(request)
|
||||
assert "library-metadata" in encoded
|
||||
lm = encoded["library-metadata"]
|
||||
assert lm["id"] == "doc-1"
|
||||
assert lm["kind"] == "application/pdf"
|
||||
assert lm["title"] == "Test Document"
|
||||
assert lm["parent-id"] == ""
|
||||
assert lm["document-type"] == "source"
|
||||
assert lm["comments"] == "test comments"
|
||||
assert lm["tags"] == ["tag1", "tag2"]
|
||||
|
||||
decoded = translator.decode(encoded)
|
||||
assert decoded.library_metadata is not None
|
||||
assert decoded.library_metadata.id == "doc-1"
|
||||
assert decoded.library_metadata.kind == "application/pdf"
|
||||
assert decoded.library_metadata.title == "Test Document"
|
||||
assert decoded.library_metadata.parent_id == ""
|
||||
assert decoded.library_metadata.document_type == "source"
|
||||
assert decoded.library_metadata.comments == "test comments"
|
||||
assert decoded.library_metadata.tags == ["tag1", "tag2"]
|
||||
|
||||
def test_roundtrip_preserves_child_document_metadata(self, translator):
|
||||
request = KnowledgeRequest(
|
||||
operation="put-kg-core",
|
||||
id="doc-1",
|
||||
library_metadata=LibraryMetadata(
|
||||
id="chunk-1",
|
||||
kind="text/plain",
|
||||
title="Chunk 1",
|
||||
parent_id="doc-1",
|
||||
document_type="chunk",
|
||||
),
|
||||
)
|
||||
|
||||
encoded = translator.encode(request)
|
||||
decoded = translator.decode(encoded)
|
||||
|
||||
assert decoded.library_metadata.parent_id == "doc-1"
|
||||
assert decoded.library_metadata.document_type == "chunk"
|
||||
|
||||
def test_roundtrip_preserves_library_blob(self, translator):
|
||||
request = KnowledgeRequest(
|
||||
operation="put-kg-core",
|
||||
id="doc-1",
|
||||
library_blob=LibraryBlob(
|
||||
id="doc-1",
|
||||
data=b"SGVsbG8gV29ybGQ=",
|
||||
),
|
||||
)
|
||||
|
||||
encoded = translator.encode(request)
|
||||
assert "library-blob" in encoded
|
||||
assert encoded["library-blob"]["id"] == "doc-1"
|
||||
assert encoded["library-blob"]["data"] == "SGVsbG8gV29ybGQ="
|
||||
|
||||
decoded = translator.decode(encoded)
|
||||
assert decoded.library_blob is not None
|
||||
assert decoded.library_blob.id == "doc-1"
|
||||
assert decoded.library_blob.data == "SGVsbG8gV29ybGQ="
|
||||
|
||||
def test_absent_library_fields_decode_as_none(self, translator):
|
||||
decoded = translator.decode({
|
||||
"operation": "get-kg-core",
|
||||
"id": "doc-1",
|
||||
})
|
||||
assert decoded.library_metadata is None
|
||||
assert decoded.library_blob is None
|
||||
|
||||
|
||||
class TestKnowledgeResponseTranslatorLibrary:
|
||||
|
||||
@pytest.fixture
|
||||
def response_translator(self):
|
||||
return KnowledgeResponseTranslator()
|
||||
|
||||
def test_encode_library_metadata(self, response_translator):
|
||||
response = KnowledgeResponse(
|
||||
ids=None,
|
||||
library_metadata=LibraryMetadata(
|
||||
id="doc-1",
|
||||
kind="application/pdf",
|
||||
title="Test",
|
||||
parent_id="",
|
||||
document_type="source",
|
||||
comments="",
|
||||
tags=[],
|
||||
),
|
||||
)
|
||||
encoded = response_translator.encode(response)
|
||||
assert "library-metadata" in encoded
|
||||
assert encoded["library-metadata"]["id"] == "doc-1"
|
||||
assert encoded["library-metadata"]["kind"] == "application/pdf"
|
||||
assert encoded["library-metadata"]["document-type"] == "source"
|
||||
|
||||
def test_encode_library_blob_bytes_to_string(self, response_translator):
|
||||
response = KnowledgeResponse(
|
||||
ids=None,
|
||||
library_blob=LibraryBlob(
|
||||
id="doc-1",
|
||||
data=b"dGVzdCBkYXRh",
|
||||
),
|
||||
)
|
||||
encoded = response_translator.encode(response)
|
||||
assert "library-blob" in encoded
|
||||
assert encoded["library-blob"]["id"] == "doc-1"
|
||||
assert encoded["library-blob"]["data"] == "dGVzdCBkYXRh"
|
||||
assert isinstance(encoded["library-blob"]["data"], str)
|
||||
|
||||
def test_encode_library_blob_string_passthrough(self, response_translator):
|
||||
response = KnowledgeResponse(
|
||||
ids=None,
|
||||
library_blob=LibraryBlob(
|
||||
id="doc-1",
|
||||
data="already-a-string",
|
||||
),
|
||||
)
|
||||
encoded = response_translator.encode(response)
|
||||
assert encoded["library-blob"]["data"] == "already-a-string"
|
||||
|
||||
def test_library_metadata_is_not_final(self, response_translator):
|
||||
response = KnowledgeResponse(
|
||||
ids=None,
|
||||
library_metadata=LibraryMetadata(id="doc-1"),
|
||||
)
|
||||
_, is_final = response_translator.encode_with_completion(response)
|
||||
assert is_final is False
|
||||
|
||||
def test_library_blob_is_not_final(self, response_translator):
|
||||
response = KnowledgeResponse(
|
||||
ids=None,
|
||||
library_blob=LibraryBlob(id="doc-1", data=b"data"),
|
||||
)
|
||||
_, is_final = response_translator.encode_with_completion(response)
|
||||
assert is_final is False
|
||||
|
||||
def test_eos_is_final(self, response_translator):
|
||||
response = KnowledgeResponse(eos=True)
|
||||
_, is_final = response_translator.encode_with_completion(response)
|
||||
assert is_final is True
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue