feat: complete knowledge core storage — named graphs, provenance, source material (#973)

Implements all three changes from the knowledge-core-completeness tech spec:

1. Named graph field preserved through Cassandra storage (7-element tuple),
   enabling provenance triples to retain their graph URIs on round-trip.

2. Provenance triples already arrive on triples-input — no routing change
   needed; Change 1 was sufficient.

3. Source material (library documents) streamed alongside triples and
   embeddings during core download/upload. The knowledge manager fetches
   the document hierarchy from the librarian on download and recreates it
   on upload, preserving the full provenance chain across instances.
This commit is contained in:
cybermaggedon 2026-06-03 10:46:52 +01:00 committed by GitHub
parent aa158e1ba3
commit 6df7471a55
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 1347 additions and 15 deletions

View file

@ -11,7 +11,12 @@ from unittest.mock import AsyncMock, Mock, patch, MagicMock
from unittest.mock import call
from trustgraph.cores.knowledge import KnowledgeManager
from trustgraph.schema import KnowledgeResponse, Triples, GraphEmbeddings, Metadata, Triple, Term, EntityEmbeddings, IRI, LITERAL
from trustgraph.schema import (
KnowledgeResponse, Triples, GraphEmbeddings, Metadata, Triple, Term,
EntityEmbeddings, IRI, LITERAL,
LibraryMetadata, LibraryBlob,
LibrarianResponse, DocumentMetadata,
)
@pytest.fixture
@ -373,11 +378,252 @@ class TestKnowledgeManagerOtherMethods:
mock_respond = AsyncMock()
await knowledge_manager.delete_kg_core(mock_request, mock_respond, "test-user")
# Verify table store was called correctly
knowledge_manager.table_store.delete_kg_core.assert_called_once_with("test-user", "test-doc-id")
# Verify response
mock_respond.assert_called_once()
response = mock_respond.call_args[0][0]
assert response.error is None
assert response.error is None
class TestKnowledgeManagerLibraryDownload:
    """Exercise get_kg_core when library documents accompany a core."""

    @pytest.fixture
    def manager_with_librarian(self, mock_flow_config):
        """Return a KnowledgeManager built with an AsyncMock librarian.

        KnowledgeTableStore is patched while the manager is constructed, and
        the manager's table_store is then replaced by an AsyncMock.
        """
        with patch('trustgraph.cores.knowledge.KnowledgeTableStore'):
            mgr = KnowledgeManager(
                cassandra_host=["localhost"],
                cassandra_username="test_user",
                cassandra_password="test_pass",
                keyspace="test_keyspace",
                flow_config=mock_flow_config,
                librarian=AsyncMock(),
            )
            mgr.table_store = AsyncMock()
            return mgr

    @pytest.mark.asyncio
    async def test_get_kg_core_streams_library_docs(self, manager_with_librarian):
        """Root and child documents each yield a metadata and a blob response."""
        mgr = manager_with_librarian
        librarian = mgr.librarian

        request = Mock()
        request.id = "root-doc"
        respond = AsyncMock()

        mgr.table_store.get_triples = AsyncMock()
        mgr.table_store.get_graph_embeddings = AsyncMock()

        # Canned librarian answers: root metadata, one child, then content
        # for two successive fetch_document_content calls.
        librarian.fetch_document_metadata.return_value = DocumentMetadata(
            id="root-doc", kind="application/pdf", title="Test PDF",
            document_type="source",
        )
        librarian.request.return_value = LibrarianResponse(
            document_metadatas=[
                DocumentMetadata(
                    id="chunk-1", kind="text/plain", title="Chunk 1",
                    parent_id="root-doc", document_type="chunk",
                ),
            ],
        )
        librarian.fetch_document_content.side_effect = [
            b"cm9vdCBjb250ZW50",
            b"Y2h1bmsgY29udGVudA==",
        ]

        await mgr.get_kg_core(request, respond, "test-user")

        # Bucket every response passed to the respond callback by payload.
        metadata_msgs, blob_msgs, eos_msgs = [], [], []
        for invocation in respond.call_args_list:
            msg = invocation.args[0]
            if msg.library_metadata is not None:
                metadata_msgs.append(msg)
            if msg.library_blob is not None:
                blob_msgs.append(msg)
            if msg.eos is True:
                eos_msgs.append(msg)

        assert len(metadata_msgs) == 2
        root_md = metadata_msgs[0].library_metadata
        child_md = metadata_msgs[1].library_metadata
        assert root_md.id == "root-doc"
        assert root_md.document_type == "source"
        assert child_md.id == "chunk-1"
        assert child_md.parent_id == "root-doc"

        assert len(blob_msgs) == 2
        root_blob = blob_msgs[0].library_blob
        assert root_blob.id == "root-doc"
        assert root_blob.data == b"cm9vdCBjb250ZW50"
        assert blob_msgs[1].library_blob.id == "chunk-1"

        assert len(eos_msgs) == 1

    @pytest.mark.asyncio
    async def test_get_kg_core_no_librarian_skips_library(self, mock_flow_config):
        """A manager built without a librarian emits no library records."""
        with patch('trustgraph.cores.knowledge.KnowledgeTableStore'):
            mgr = KnowledgeManager(
                cassandra_host=["localhost"],
                cassandra_username="u",
                cassandra_password="p",
                keyspace="ks",
                flow_config=mock_flow_config,
            )
            mgr.table_store = AsyncMock()
            mgr.table_store.get_triples = AsyncMock()
            mgr.table_store.get_graph_embeddings = AsyncMock()

            request = Mock()
            request.id = "doc-1"
            respond = AsyncMock()

            await mgr.get_kg_core(request, respond, "w")

            sent = [invocation.args[0] for invocation in respond.call_args_list]
            assert all(msg.library_metadata is None for msg in sent)
            assert all(msg.library_blob is None for msg in sent)

    @pytest.mark.asyncio
    async def test_get_kg_core_librarian_metadata_failure_is_graceful(
        self, manager_with_librarian,
    ):
        """A failing metadata fetch yields no library metadata but still an eos."""
        mgr = manager_with_librarian

        request = Mock()
        request.id = "missing-doc"
        respond = AsyncMock()

        mgr.table_store.get_triples = AsyncMock()
        mgr.table_store.get_graph_embeddings = AsyncMock()

        # The librarian raises as soon as the root document is looked up.
        mgr.librarian.fetch_document_metadata.side_effect = (
            RuntimeError("not found")
        )

        await mgr.get_kg_core(request, respond, "test-user")

        sent = [invocation.args[0] for invocation in respond.call_args_list]
        assert all(msg.library_metadata is None for msg in sent)
        assert any(msg.eos for msg in sent)
class TestKnowledgeManagerLibraryUpload:
    """Test put_kg_core handling of library metadata and blob records."""

    @pytest.fixture
    def manager_with_librarian(self, mock_flow_config):
        """Return a KnowledgeManager built with an AsyncMock librarian.

        KnowledgeTableStore is patched while the manager is constructed, and
        the manager's table_store is then replaced by an AsyncMock.
        """
        with patch('trustgraph.cores.knowledge.KnowledgeTableStore'):
            mock_librarian = AsyncMock()
            manager = KnowledgeManager(
                cassandra_host=["localhost"],
                cassandra_username="u", cassandra_password="p",
                keyspace="ks", flow_config=mock_flow_config,
                librarian=mock_librarian,
            )
            manager.table_store = AsyncMock()
            return manager

    @staticmethod
    def _make_request(library_metadata=None, library_blob=None):
        """Build a mock request for put_kg_core.

        triples and graph_embeddings are always None; the two library fields
        take the values given (None when omitted).
        """
        request = Mock()
        request.triples = None
        request.graph_embeddings = None
        request.library_metadata = library_metadata
        request.library_blob = library_blob
        return request

    @pytest.mark.asyncio
    async def test_put_metadata_then_blob_calls_librarian(
        self, manager_with_librarian,
    ):
        """Metadata alone is buffered; the matching blob triggers add-document."""
        librarian = manager_with_librarian.librarian
        mock_respond = AsyncMock()
        librarian.request.return_value = LibrarianResponse()

        # First call: metadata
        req_meta = self._make_request(
            library_metadata=LibraryMetadata(
                id="doc-1", kind="application/pdf", title="Test",
                document_type="source",
            ),
        )
        await manager_with_librarian.put_kg_core(req_meta, mock_respond, "ws")

        # Metadata is buffered, librarian not called yet
        librarian.request.assert_not_called()

        # Second call: blob
        req_blob = self._make_request(
            library_blob=LibraryBlob(id="doc-1", data=b"dGVzdA=="),
        )
        await manager_with_librarian.put_kg_core(req_blob, mock_respond, "ws")

        # Now librarian should have been called with add-document
        librarian.request.assert_called_once()
        call_args = librarian.request.call_args[0][0]
        assert call_args.operation == "add-document"
        assert call_args.document_metadata.id == "doc-1"
        assert call_args.document_metadata.kind == "application/pdf"
        assert call_args.content == b"dGVzdA=="

    @pytest.mark.asyncio
    async def test_put_child_document_uses_add_child_operation(
        self, manager_with_librarian,
    ):
        """Metadata carrying a parent_id is stored via add-child-document."""
        librarian = manager_with_librarian.librarian
        mock_respond = AsyncMock()
        librarian.request.return_value = LibrarianResponse()

        req_meta = self._make_request(
            library_metadata=LibraryMetadata(
                id="chunk-1", kind="text/plain", title="Chunk",
                parent_id="doc-1", document_type="chunk",
            ),
        )
        await manager_with_librarian.put_kg_core(req_meta, mock_respond, "ws")

        req_blob = self._make_request(
            library_blob=LibraryBlob(id="chunk-1", data=b"Y2h1bms="),
        )
        await manager_with_librarian.put_kg_core(req_blob, mock_respond, "ws")

        call_args = librarian.request.call_args[0][0]
        assert call_args.operation == "add-child-document"
        assert call_args.document_metadata.parent_id == "doc-1"

    @pytest.mark.asyncio
    async def test_put_blob_without_metadata_logs_warning(
        self, manager_with_librarian,
    ):
        """A blob with no preceding metadata never reaches the librarian.

        Only the absence of a librarian call is checked here; the log output
        itself is not inspected.
        """
        mock_respond = AsyncMock()

        req_blob = self._make_request(
            library_blob=LibraryBlob(id="orphan", data=b"data"),
        )
        await manager_with_librarian.put_kg_core(req_blob, mock_respond, "ws")

        # Librarian should not be called for orphan blob
        manager_with_librarian.librarian.request.assert_not_called()

    @pytest.mark.asyncio
    async def test_put_existing_document_is_graceful(
        self, manager_with_librarian,
    ):
        """A librarian "already exists" failure does not escape put_kg_core."""
        librarian = manager_with_librarian.librarian
        mock_respond = AsyncMock()
        librarian.request.side_effect = RuntimeError(
            "Document already exists"
        )

        req_meta = self._make_request(
            library_metadata=LibraryMetadata(
                id="doc-1", kind="application/pdf", title="Test",
                document_type="source",
            ),
        )
        await manager_with_librarian.put_kg_core(req_meta, mock_respond, "ws")

        req_blob = self._make_request(
            library_blob=LibraryBlob(id="doc-1", data=b"data"),
        )
        # Should not raise — "already exists" is handled gracefully
        await manager_with_librarian.put_kg_core(req_blob, mock_respond, "ws")

        # Without this check the test would also pass if the librarian were
        # never invoked, leaving the error-handling path untested.
        librarian.request.assert_called()

View file

@ -155,7 +155,7 @@ class TestGetTriples:
@pytest.mark.asyncio
@patch('trustgraph.tables.knowledge.async_execute_paged', new_callable=AsyncMock)
async def test_row_converts_to_triples(self, mock_async_execute_paged):
# row[3] is a list of (s_val, s_uri, p_val, p_uri, o_val, o_uri)
# row[3] is a list of (s_val, s_uri, p_val, p_uri, o_val, o_uri, graph)
fake_row = (
None, None, None,
[
@ -163,6 +163,7 @@ class TestGetTriples:
"http://example.org/alice", True,
"http://example.org/knows", True,
"http://example.org/bob", True,
"urn:graph:source",
),
],
)
@ -191,3 +192,33 @@ class TestGetTriples:
assert t.s.iri == "http://example.org/alice"
assert t.p.iri == "http://example.org/knows"
assert t.o.iri == "http://example.org/bob"
assert t.g == "urn:graph:source"
@pytest.mark.asyncio
@patch('trustgraph.tables.knowledge.async_execute_paged', new_callable=AsyncMock)
async def test_empty_graph_name_becomes_none(self, mock_async_execute_paged):
    """An empty-string graph element in a stored row is read back as g=None."""
    # One stored triple whose seventh element (the graph name) is empty.
    stored_triple = (
        "http://example.org/alice", True,
        "http://example.org/knows", True,
        "http://example.org/bob", True,
        "",
    )
    row = (None, None, None, [stored_triple])

    store = _make_store()
    store.cassandra = Mock()
    store.get_triples_stmt = Mock()
    mock_async_execute_paged.return_value = [[row]]

    collected = []

    async def collect(message):
        collected.append(message)

    await store.get_triples("w", "d", collect)

    assert collected[0].triples[0].g is None

View file

@ -1,5 +1,6 @@
"""
Round-trip unit tests for KnowledgeRequestTranslator.
Round-trip unit tests for KnowledgeRequestTranslator and
KnowledgeResponseTranslator.
Regression coverage: a previous version of the decode side constructed
EntityEmbeddings(vectors=...), but the schema field is `vector` (singular),
@ -15,9 +16,13 @@ Triples breaks the test.
import pytest
from trustgraph.messaging.translators.knowledge import KnowledgeRequestTranslator
from trustgraph.messaging.translators.knowledge import (
KnowledgeRequestTranslator,
KnowledgeResponseTranslator,
)
from trustgraph.schema import (
KnowledgeRequest,
KnowledgeResponse,
GraphEmbeddings,
EntityEmbeddings,
Triples,
@ -25,6 +30,8 @@ from trustgraph.schema import (
Metadata,
Term,
IRI,
LibraryMetadata,
LibraryBlob,
)
@ -145,3 +152,161 @@ class TestKnowledgeRequestTranslatorTriples:
assert t.s.iri == "http://example.org/alice"
assert t.p.iri == "http://example.org/knows"
assert t.o.iri == "http://example.org/bob"
class TestKnowledgeRequestTranslatorLibrary:
    """Encode/decode round trips for the library fields of KnowledgeRequest."""

    def test_roundtrip_preserves_library_metadata(self, translator):
        """Every LibraryMetadata field survives encode followed by decode."""
        metadata = LibraryMetadata(
            id="doc-1",
            kind="application/pdf",
            title="Test Document",
            parent_id="",
            document_type="source",
            comments="test comments",
            tags=["tag1", "tag2"],
        )
        request = KnowledgeRequest(
            operation="put-kg-core",
            id="doc-1",
            library_metadata=metadata,
        )

        wire = translator.encode(request)

        assert "library-metadata" in wire
        wire_metadata = wire["library-metadata"]
        # The encoded form uses hyphenated keys.
        expected_wire = {
            "id": "doc-1",
            "kind": "application/pdf",
            "title": "Test Document",
            "parent-id": "",
            "document-type": "source",
            "comments": "test comments",
            "tags": ["tag1", "tag2"],
        }
        for key, value in expected_wire.items():
            assert wire_metadata[key] == value, key

        restored = translator.decode(wire)

        assert restored.library_metadata is not None
        expected_attrs = {
            "id": "doc-1",
            "kind": "application/pdf",
            "title": "Test Document",
            "parent_id": "",
            "document_type": "source",
            "comments": "test comments",
            "tags": ["tag1", "tag2"],
        }
        for attr, value in expected_attrs.items():
            assert getattr(restored.library_metadata, attr) == value, attr

    def test_roundtrip_preserves_child_document_metadata(self, translator):
        """parent_id and document_type of a child document survive a round trip."""
        child = LibraryMetadata(
            id="chunk-1",
            kind="text/plain",
            title="Chunk 1",
            parent_id="doc-1",
            document_type="chunk",
        )
        request = KnowledgeRequest(
            operation="put-kg-core",
            id="doc-1",
            library_metadata=child,
        )

        restored = translator.decode(translator.encode(request))

        assert restored.library_metadata.parent_id == "doc-1"
        assert restored.library_metadata.document_type == "chunk"

    def test_roundtrip_preserves_library_blob(self, translator):
        """A bytes blob is encoded as a string and decodes to that same string."""
        blob = LibraryBlob(
            id="doc-1",
            data=b"SGVsbG8gV29ybGQ=",
        )
        request = KnowledgeRequest(
            operation="put-kg-core",
            id="doc-1",
            library_blob=blob,
        )

        wire = translator.encode(request)

        assert "library-blob" in wire
        wire_blob = wire["library-blob"]
        assert wire_blob["id"] == "doc-1"
        assert wire_blob["data"] == "SGVsbG8gV29ybGQ="

        restored = translator.decode(wire)

        assert restored.library_blob is not None
        assert restored.library_blob.id == "doc-1"
        assert restored.library_blob.data == "SGVsbG8gV29ybGQ="

    def test_absent_library_fields_decode_as_none(self, translator):
        """A payload without library keys decodes with both fields set to None."""
        payload = {
            "operation": "get-kg-core",
            "id": "doc-1",
        }

        restored = translator.decode(payload)

        assert restored.library_metadata is None
        assert restored.library_blob is None
class TestKnowledgeResponseTranslatorLibrary:
    """Encoding and completion flags for library records in KnowledgeResponse."""

    @pytest.fixture
    def response_translator(self):
        """Provide a fresh KnowledgeResponseTranslator for each test."""
        return KnowledgeResponseTranslator()

    def test_encode_library_metadata(self, response_translator):
        """Library metadata is encoded under the "library-metadata" key."""
        metadata = LibraryMetadata(
            id="doc-1",
            kind="application/pdf",
            title="Test",
            parent_id="",
            document_type="source",
            comments="",
            tags=[],
        )
        response = KnowledgeResponse(ids=None, library_metadata=metadata)

        wire = response_translator.encode(response)

        assert "library-metadata" in wire
        wire_metadata = wire["library-metadata"]
        assert wire_metadata["id"] == "doc-1"
        assert wire_metadata["kind"] == "application/pdf"
        assert wire_metadata["document-type"] == "source"

    def test_encode_library_blob_bytes_to_string(self, response_translator):
        """Blob data supplied as bytes is emitted as a str."""
        blob = LibraryBlob(
            id="doc-1",
            data=b"dGVzdCBkYXRh",
        )
        response = KnowledgeResponse(ids=None, library_blob=blob)

        wire = response_translator.encode(response)

        assert "library-blob" in wire
        wire_blob = wire["library-blob"]
        assert wire_blob["id"] == "doc-1"
        assert wire_blob["data"] == "dGVzdCBkYXRh"
        assert isinstance(wire_blob["data"], str)

    def test_encode_library_blob_string_passthrough(self, response_translator):
        """Blob data that is already a str is emitted unchanged."""
        blob = LibraryBlob(
            id="doc-1",
            data="already-a-string",
        )
        response = KnowledgeResponse(ids=None, library_blob=blob)

        wire = response_translator.encode(response)

        assert wire["library-blob"]["data"] == "already-a-string"

    def test_library_metadata_is_not_final(self, response_translator):
        """A metadata-only response is not reported as final."""
        response = KnowledgeResponse(
            ids=None,
            library_metadata=LibraryMetadata(id="doc-1"),
        )

        _encoded, final = response_translator.encode_with_completion(response)

        assert final is False

    def test_library_blob_is_not_final(self, response_translator):
        """A blob-only response is not reported as final."""
        response = KnowledgeResponse(
            ids=None,
            library_blob=LibraryBlob(id="doc-1", data=b"data"),
        )

        _encoded, final = response_translator.encode_with_completion(response)

        assert final is False

    def test_eos_is_final(self, response_translator):
        """A response with eos=True is reported as final."""
        response = KnowledgeResponse(eos=True)

        _encoded, final = response_translator.encode_with_completion(response)

        assert final is True