Protect from null embeddings in cores (#626)

* Don't emit graph embeddings if there aren't any.

* Don't store graph embeddings in a knowledge store if the embeddings list is empty.

* Translate between Cassandra's 'null', which it uses to represent an empty
list, and the empty list that the surrounding code expects (and that was
stored in the first place).

* Avoid emitting empty embedding lists

* Avoid outputting empty triple lists

* Fix tests
This commit is contained in:
cybermaggedon 2026-02-09 14:07:07 +00:00 committed by GitHub
parent e214eb4e02
commit ca626c8471
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 88 additions and 91 deletions

View file

@ -16,7 +16,7 @@ from trustgraph.extract.kg.definitions.extract import Processor as DefinitionsPr
from trustgraph.extract.kg.relationships.extract import Processor as RelationshipsProcessor
from trustgraph.storage.knowledge.store import Processor as KnowledgeStoreProcessor
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Value, Error
from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings
from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings, EntityEmbeddings
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
@ -405,9 +405,14 @@ class TestKnowledgeGraphPipelineIntegration:
collection="test_collection",
metadata=[]
),
entities=[]
entities=[
EntityEmbeddings(
entity=Value(value="http://example.org/entity", is_uri=True),
vectors=[[0.1, 0.2, 0.3]]
)
]
)
mock_msg = MagicMock()
mock_msg.value.return_value = sample_embeddings
@ -496,12 +501,12 @@ class TestKnowledgeGraphPipelineIntegration:
await definitions_processor.on_message(mock_msg, mock_consumer, mock_flow_context)
# Assert
# Should still call producers but with empty results
# Should NOT call producers with empty results (avoids Cassandra NULL issues)
triples_producer = mock_flow_context("triples")
entity_contexts_producer = mock_flow_context("entity-contexts")
triples_producer.send.assert_called_once()
entity_contexts_producer.send.assert_called_once()
triples_producer.send.assert_not_called()
entity_contexts_producer.send.assert_not_called()
@pytest.mark.asyncio
async def test_invalid_extraction_format_handling(self, definitions_processor, mock_flow_context, sample_chunk):