Fix Metadata/EntityEmbeddings schema migration tail and add regression tests (#777)

The Metadata dataclass dropped its `metadata: list[Triple]` field
and EntityEmbeddings/ChunkEmbeddings settled on a singular
`vector: list[float]` field, but several call sites kept passing
`Metadata(metadata=...)` and `EntityEmbeddings(vectors=...)`. The
bugs were latent until a websocket client first hit
`/api/v1/flow/default/import/entity-contexts`, at which point the
dispatcher TypeError'd on construction.

Production fixes (5 call sites on the same migration tail):

  * trustgraph-flow gateway dispatchers entity_contexts_import.py
    and graph_embeddings_import.py — drop the stale
    Metadata(metadata=...)  kwarg; switch graph_embeddings_import
    to the singular `vector` wire key.
  * trustgraph-base messaging translators knowledge.py and
    document_loading.py — fix decode side to read the singular
    `"vector"` key, matching what their own encode sides have
    always written.
  * trustgraph-flow tables/knowledge.py — fix Cassandra row
    deserialiser to construct EntityEmbeddings(vector=...)
    instead of vectors=.
  * trustgraph-flow gateway core_import/core_export — switch the
    kg-core msgpack wire format to the singular `"v"`/`"vector"`
    key and drop the dead `m["m"]` envelope field that referenced
    the removed Metadata.metadata triples list (it was a
    guaranteed KeyError on the export side).

Defense-in-depth regression coverage (32 new tests across 7 files):

  * tests/contract/test_schema_field_contracts.py — pin the field
    set of Metadata, EntityEmbeddings, ChunkEmbeddings,
    EntityContext so any future schema rename fails CI loudly
    with a clear diff.
  * tests/unit/test_translators/test_knowledge_translator_roundtrip.py
    and test_document_embeddings_translator_roundtrip.py -
    encode→decode round-trip the affected translators end to end,
    locking in the singular `"vector"` wire key.
  * tests/unit/test_gateway/test_entity_contexts_import_dispatcher.py
    and test_graph_embeddings_import_dispatcher.py — exercise the
    websocket dispatchers' receive() path with realistic
    payloads, the direct regression test for the original
    production crash.
  * tests/unit/test_gateway/test_core_import_export_roundtrip.py
    — pack/unpack the kg-core msgpack format through the real
    dispatcher classes (with KnowledgeRequestor mocked),
    including a full export→import round-trip.
  * tests/unit/test_tables/test_knowledge_table_store.py —
    exercise the Cassandra row → schema conversion via __new__ to
    bypass the live cluster connection.

Also fixes an unrelated leaked-coroutine RuntimeWarning in
test_gateway/test_service.py::test_run_method_calls_web_run_app: the
mocked aiohttp.web.run_app now closes the coroutine that Api.run() hands
it, mirroring what the real run_app would do, instead of leaving it for
the GC to complain about.
This commit is contained in:
cybermaggedon 2026-04-10 20:43:45 +01:00 committed by GitHub
parent 0994d4b05f
commit c23e28aa66
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 1415 additions and 17 deletions

View file

View file

@ -0,0 +1,197 @@
"""
Unit tests for KnowledgeTableStore row deserialization.
Regression coverage: a previous version of get_graph_embeddings constructed
EntityEmbeddings(vectors=ent[1]) the schema field is `vector` (singular),
so any real Cassandra row would crash on read. These tests bypass the live
Cassandra connection entirely and exercise the row -> schema conversion
with hand-built fake rows.
"""
import pytest
from unittest.mock import Mock
from trustgraph.tables.knowledge import KnowledgeTableStore
from trustgraph.schema import (
EntityEmbeddings,
GraphEmbeddings,
Triples,
Triple,
Metadata,
IRI,
LITERAL,
)
def _make_store():
"""
Build a KnowledgeTableStore without invoking __init__ (which connects
to Cassandra). Tests inject only the attributes the method under test
actually touches.
"""
return KnowledgeTableStore.__new__(KnowledgeTableStore)
class TestGetGraphEmbeddings:
@pytest.mark.asyncio
async def test_row_converts_to_entity_embeddings_with_singular_vector(self):
"""
Cassandra rows return entities as a list of [entity_tuple, vector]
pairs in row[3]. The deserializer must construct EntityEmbeddings
with `vector=` (singular) the schema field name. A previous
version used `vectors=` and TypeError'd at runtime.
"""
# Arrange — fake row matching the get_triples_stmt result shape:
# row[0..2] are unused by the method, row[3] is the entities blob
fake_row = (
None, None, None,
[
# ((value, is_uri), vector)
(("http://example.org/alice", True), [0.1, 0.2, 0.3]),
(("http://example.org/bob", True), [0.4, 0.5, 0.6]),
(("a literal entity", False), [0.7, 0.8, 0.9]),
],
)
store = _make_store()
store.cassandra = Mock()
store.cassandra.execute = Mock(return_value=[fake_row])
store.get_graph_embeddings_stmt = Mock()
received = []
async def receiver(msg):
received.append(msg)
# Act
await store.get_graph_embeddings(
user="alice",
document_id="doc-1",
receiver=receiver,
)
# Assert
store.cassandra.execute.assert_called_once_with(
store.get_graph_embeddings_stmt,
("alice", "doc-1"),
)
assert len(received) == 1
ge = received[0]
assert isinstance(ge, GraphEmbeddings)
assert isinstance(ge.metadata, Metadata)
assert ge.metadata.id == "doc-1"
assert ge.metadata.user == "alice"
assert len(ge.entities) == 3
assert all(isinstance(e, EntityEmbeddings) for e in ge.entities)
# Vectors land in the singular `vector` field — this is the
# explicit regression assertion for the original bug.
assert ge.entities[0].vector == [0.1, 0.2, 0.3]
assert ge.entities[1].vector == [0.4, 0.5, 0.6]
assert ge.entities[2].vector == [0.7, 0.8, 0.9]
# Term type round-trips through tuple_to_term
assert ge.entities[0].entity.type == IRI
assert ge.entities[0].entity.iri == "http://example.org/alice"
assert ge.entities[1].entity.type == IRI
assert ge.entities[1].entity.iri == "http://example.org/bob"
assert ge.entities[2].entity.type == LITERAL
assert ge.entities[2].entity.value == "a literal entity"
@pytest.mark.asyncio
async def test_empty_entities_blob_yields_empty_list(self):
"""row[3] being None / empty must produce a GraphEmbeddings with
no entities, not raise."""
fake_row = (None, None, None, None)
store = _make_store()
store.cassandra = Mock()
store.cassandra.execute = Mock(return_value=[fake_row])
store.get_graph_embeddings_stmt = Mock()
received = []
async def receiver(msg):
received.append(msg)
await store.get_graph_embeddings("u", "d", receiver)
assert len(received) == 1
assert received[0].entities == []
@pytest.mark.asyncio
async def test_multiple_rows_each_emit_one_message(self):
fake_rows = [
(None, None, None, [
(("http://example.org/a", True), [1.0]),
]),
(None, None, None, [
(("http://example.org/b", True), [2.0]),
]),
]
store = _make_store()
store.cassandra = Mock()
store.cassandra.execute = Mock(return_value=fake_rows)
store.get_graph_embeddings_stmt = Mock()
received = []
async def receiver(msg):
received.append(msg)
await store.get_graph_embeddings("u", "d", receiver)
assert len(received) == 2
assert received[0].entities[0].entity.iri == "http://example.org/a"
assert received[0].entities[0].vector == [1.0]
assert received[1].entities[0].entity.iri == "http://example.org/b"
assert received[1].entities[0].vector == [2.0]
class TestGetTriples:
"""Bonus: the sibling get_triples path uses the same row[3] shape and
the same Metadata construction. Cover it for parity."""
@pytest.mark.asyncio
async def test_row_converts_to_triples(self):
# row[3] is a list of (s_val, s_uri, p_val, p_uri, o_val, o_uri)
fake_row = (
None, None, None,
[
(
"http://example.org/alice", True,
"http://example.org/knows", True,
"http://example.org/bob", True,
),
],
)
store = _make_store()
store.cassandra = Mock()
store.cassandra.execute = Mock(return_value=[fake_row])
store.get_triples_stmt = Mock()
received = []
async def receiver(msg):
received.append(msg)
await store.get_triples("alice", "doc-1", receiver)
assert len(received) == 1
triples_msg = received[0]
assert isinstance(triples_msg, Triples)
assert isinstance(triples_msg.metadata, Metadata)
assert triples_msg.metadata.id == "doc-1"
assert triples_msg.metadata.user == "alice"
assert len(triples_msg.triples) == 1
t = triples_msg.triples[0]
assert isinstance(t, Triple)
assert t.s.iri == "http://example.org/alice"
assert t.p.iri == "http://example.org/knows"
assert t.o.iri == "http://example.org/bob"