Fix Metadata/EntityEmbeddings schema migration tail and add regression tests (#777)

The Metadata dataclass dropped its `metadata: list[Triple]` field
and EntityEmbeddings/ChunkEmbeddings settled on a singular
`vector: list[float]` field, but several call sites kept passing
`Metadata(metadata=...)` and `EntityEmbeddings(vectors=...)`. The
bugs were latent until a websocket client first hit
`/api/v1/flow/default/import/entity-contexts`, at which point the
dispatcher TypeError'd on construction.

Production fixes (5 call sites on the same migration tail):

  * trustgraph-flow gateway dispatchers entity_contexts_import.py
    and graph_embeddings_import.py — drop the stale
    Metadata(metadata=...) kwarg; switch graph_embeddings_import
    to the singular `vector` wire key.
  * trustgraph-base messaging translators knowledge.py and
    document_loading.py — fix decode side to read the singular
    `"vector"` key, matching what their own encode sides have
    always written.
  * trustgraph-flow tables/knowledge.py — fix Cassandra row
    deserialiser to construct EntityEmbeddings(vector=...)
    instead of vectors=.
  * trustgraph-flow gateway core_import/core_export — switch the
    kg-core msgpack wire format to the singular `"v"`/`"vector"`
    key and drop the dead `m["m"]` envelope field that referenced
    the removed Metadata.metadata triples list (it was a
    guaranteed KeyError on the export side).

Defense-in-depth regression coverage (32 new tests across 7 files):

  * tests/contract/test_schema_field_contracts.py — pin the field
    set of Metadata, EntityEmbeddings, ChunkEmbeddings,
    EntityContext so any future schema rename fails CI loudly
    with a clear diff.
  * tests/unit/test_translators/test_knowledge_translator_roundtrip.py
    and test_document_embeddings_translator_roundtrip.py —
    encode→decode round-trip the affected translators end to end,
    locking in the singular `"vector"` wire key.
  * tests/unit/test_gateway/test_entity_contexts_import_dispatcher.py
    and test_graph_embeddings_import_dispatcher.py — exercise the
    websocket dispatchers' receive() path with realistic
    payloads, the direct regression test for the original
    production crash.
  * tests/unit/test_gateway/test_core_import_export_roundtrip.py
    — pack/unpack the kg-core msgpack format through the real
    dispatcher classes (with KnowledgeRequestor mocked),
    including a full export→import round-trip.
  * tests/unit/test_tables/test_knowledge_table_store.py —
    exercise the Cassandra row → schema conversion via __new__ to
    bypass the live cluster connection.

Also fixes an unrelated leaked-coroutine RuntimeWarning in
test_gateway/test_service.py::test_run_method_calls_web_run_app: the
mocked aiohttp.web.run_app now closes the coroutine that Api.run() hands
it, mirroring what the real run_app would do, instead of leaving it for
the GC to complain about.
This commit is contained in:
cybermaggedon 2026-04-10 20:43:45 +01:00 committed by GitHub
parent 0994d4b05f
commit c23e28aa66
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 1415 additions and 17 deletions

View file

View file

@ -0,0 +1,66 @@
"""
Round-trip unit tests for DocumentEmbeddingsTranslator.
Regression coverage: a previous version of the decode side constructed
ChunkEmbeddings(vectors=...) — the schema field is `vector` (singular),
so any real DocumentEmbeddings message would crash on decode. The encode
side already wrote `"vector"`, so encode→decode was asymmetric.
"""
import pytest
from trustgraph.messaging.translators.document_loading import (
DocumentEmbeddingsTranslator,
)
from trustgraph.schema import (
DocumentEmbeddings,
ChunkEmbeddings,
Metadata,
)
@pytest.fixture
def translator():
    """Provide the DocumentEmbeddingsTranslator instance under test."""
    instance = DocumentEmbeddingsTranslator()
    return instance
@pytest.fixture
def sample():
    """Build a two-chunk DocumentEmbeddings message used as test input."""
    meta = Metadata(
        id="doc-1",
        root="",
        user="alice",
        collection="testcoll",
    )
    first_chunk = ChunkEmbeddings(chunk_id="c1", vector=[0.1, 0.2, 0.3])
    second_chunk = ChunkEmbeddings(chunk_id="c2", vector=[0.4, 0.5, 0.6])
    return DocumentEmbeddings(
        metadata=meta,
        chunks=[first_chunk, second_chunk],
    )
class TestDocumentEmbeddingsTranslator:
    """Encode and encode/decode checks for DocumentEmbeddingsTranslator."""

    def test_encode_uses_singular_vector_key(self, translator, sample):
        """Each encoded chunk carries `vector` and never `vectors`."""
        wire_chunks = translator.encode(sample)["chunks"]

        for wire_chunk in wire_chunks:
            assert "vector" in wire_chunk
        for wire_chunk in wire_chunks:
            assert "vectors" not in wire_chunk
        assert wire_chunks[0]["vector"] == [0.1, 0.2, 0.3]

    def test_roundtrip_preserves_document_embeddings(self, translator, sample):
        """Decoding an encoded message gives back the same field values."""
        decoded = translator.decode(translator.encode(sample))
        assert isinstance(decoded, DocumentEmbeddings)

        # Metadata must come back as a real schema object, not a dict.
        meta = decoded.metadata
        assert isinstance(meta, Metadata)
        assert meta.id == "doc-1"
        assert meta.user == "alice"
        assert meta.collection == "testcoll"

        expected_chunks = [
            ("c1", [0.1, 0.2, 0.3]),
            ("c2", [0.4, 0.5, 0.6]),
        ]
        assert len(decoded.chunks) == 2
        for position, (chunk_id, vector) in enumerate(expected_chunks):
            assert decoded.chunks[position].chunk_id == chunk_id
            assert decoded.chunks[position].vector == vector

View file

@ -0,0 +1,153 @@
"""
Round-trip unit tests for KnowledgeRequestTranslator.
Regression coverage: a previous version of the decode side constructed
EntityEmbeddings(vectors=...) — the schema field is `vector` (singular),
so any real graph-embeddings KnowledgeRequest would crash on first
message. The encode side already wrote `"vector"`, so encode→decode was
asymmetric.
These tests build a real KnowledgeRequest with graph-embeddings, encode
it, decode the result, and assert the round-trip is lossless. They also
exercise the triples path so any future schema drift in Metadata or
Triples breaks the test.
"""
import pytest
from trustgraph.messaging.translators.knowledge import KnowledgeRequestTranslator
from trustgraph.schema import (
KnowledgeRequest,
GraphEmbeddings,
EntityEmbeddings,
Triples,
Triple,
Metadata,
Term,
IRI,
)
def _term_iri(uri):
    """Wrap *uri* in a Term whose type is IRI."""
    term = Term(type=IRI, iri=uri)
    return term
@pytest.fixture
def translator():
    """Provide the KnowledgeRequestTranslator instance under test."""
    instance = KnowledgeRequestTranslator()
    return instance
@pytest.fixture
def graph_embeddings_request():
    """Build a put-kg-core KnowledgeRequest carrying two entity embeddings."""
    meta = Metadata(
        id="doc-1",
        root="",
        user="alice",
        collection="testcoll",
    )
    # (entity IRI, embedding vector) pairs, in the order the tests expect.
    entity_specs = (
        ("http://example.org/alice", [0.1, 0.2, 0.3]),
        ("http://example.org/bob", [0.4, 0.5, 0.6]),
    )
    embedded = [
        EntityEmbeddings(entity=_term_iri(uri), vector=vec)
        for uri, vec in entity_specs
    ]
    return KnowledgeRequest(
        operation="put-kg-core",
        user="alice",
        id="doc-1",
        flow="default",
        collection="testcoll",
        graph_embeddings=GraphEmbeddings(metadata=meta, entities=embedded),
    )
@pytest.fixture
def triples_request():
    """Build a put-kg-core KnowledgeRequest carrying a single triple."""
    meta = Metadata(
        id="doc-1",
        root="",
        user="alice",
        collection="testcoll",
    )
    alice_knows_bob = Triple(
        s=_term_iri("http://example.org/alice"),
        p=_term_iri("http://example.org/knows"),
        o=_term_iri("http://example.org/bob"),
    )
    return KnowledgeRequest(
        operation="put-kg-core",
        user="alice",
        id="doc-1",
        flow="default",
        collection="testcoll",
        triples=Triples(metadata=meta, triples=[alice_knows_bob]),
    )
class TestKnowledgeRequestTranslatorGraphEmbeddings:
    """Graph-embeddings branch of KnowledgeRequestTranslator."""

    def test_encode_produces_singular_vector_key(
        self, translator, graph_embeddings_request,
    ):
        """Encoded entities use the `vector` key and never `vectors`."""
        wire = translator.encode(graph_embeddings_request)
        wire_entities = wire["graph-embeddings"]["entities"]

        for wire_entity in wire_entities:
            assert "vector" in wire_entity
        for wire_entity in wire_entities:
            assert "vectors" not in wire_entity
        assert wire_entities[0]["vector"] == [0.1, 0.2, 0.3]

    def test_roundtrip_preserves_graph_embeddings(
        self, translator, graph_embeddings_request,
    ):
        """Encoding then decoding keeps every graph-embeddings field."""
        decoded = translator.decode(translator.encode(graph_embeddings_request))

        # Envelope fields of the request itself.
        assert isinstance(decoded, KnowledgeRequest)
        assert decoded.operation == "put-kg-core"
        assert decoded.user == "alice"
        assert decoded.id == "doc-1"
        assert decoded.flow == "default"
        assert decoded.collection == "testcoll"

        assert decoded.graph_embeddings is not None
        ge = decoded.graph_embeddings
        assert isinstance(ge, GraphEmbeddings)

        meta = ge.metadata
        assert isinstance(meta, Metadata)
        assert meta.id == "doc-1"
        assert meta.user == "alice"
        assert meta.collection == "testcoll"

        expected_vectors = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
        expected_iris = ["http://example.org/alice", "http://example.org/bob"]
        assert len(ge.entities) == 2
        for position, vector in enumerate(expected_vectors):
            assert ge.entities[position].vector == vector
        for position, iri in enumerate(expected_iris):
            assert ge.entities[position].entity.iri == iri
class TestKnowledgeRequestTranslatorTriples:
    """Triples branch of KnowledgeRequestTranslator."""

    def test_roundtrip_preserves_triples(self, translator, triples_request):
        """Encoding then decoding keeps the metadata and the single triple."""
        decoded = translator.decode(translator.encode(triples_request))

        assert isinstance(decoded, KnowledgeRequest)
        assert decoded.triples is not None

        meta = decoded.triples.metadata
        assert isinstance(meta, Metadata)
        assert meta.id == "doc-1"
        assert meta.user == "alice"
        assert meta.collection == "testcoll"

        restored = decoded.triples.triples
        assert len(restored) == 1
        triple = restored[0]
        assert triple.s.iri == "http://example.org/alice"
        assert triple.p.iri == "http://example.org/knows"
        assert triple.o.iri == "http://example.org/bob"