mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-09 07:12:37 +02:00
Protect null embeddings - v2.0 (#627)
* Don't emit graph embeddings if there aren't any.
* Don't store graph embeddings in a knowledge store if the list is empty.
* Translate between Cassandra's 'null', which represents an empty list, and the empty list that the surrounding code expects (and that was stored in the first place).
* Avoid emitting empty embedding lists.
* Avoid outputting empty triple lists.
* Fix tests.
This commit is contained in:
parent
98827e5561
commit
8574861196
7 changed files with 88 additions and 91 deletions
|
|
@ -16,7 +16,7 @@ from trustgraph.extract.kg.definitions.extract import Processor as DefinitionsPr
|
|||
from trustgraph.extract.kg.relationships.extract import Processor as RelationshipsProcessor
|
||||
from trustgraph.storage.knowledge.store import Processor as KnowledgeStoreProcessor
|
||||
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
|
||||
from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings
|
||||
from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings, EntityEmbeddings
|
||||
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
||||
|
||||
|
||||
|
|
@ -405,7 +405,12 @@ class TestKnowledgeGraphPipelineIntegration:
|
|||
collection="test_collection",
|
||||
metadata=[]
|
||||
),
|
||||
entities=[]
|
||||
entities=[
|
||||
EntityEmbeddings(
|
||||
entity=Term(type=IRI, iri="http://example.org/entity"),
|
||||
vectors=[[0.1, 0.2, 0.3]]
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
mock_msg = MagicMock()
|
||||
|
|
@ -496,12 +501,12 @@ class TestKnowledgeGraphPipelineIntegration:
|
|||
await definitions_processor.on_message(mock_msg, mock_consumer, mock_flow_context)
|
||||
|
||||
# Assert
|
||||
# Should still call producers but with empty results
|
||||
# Should NOT call producers with empty results (avoids Cassandra NULL issues)
|
||||
triples_producer = mock_flow_context("triples")
|
||||
entity_contexts_producer = mock_flow_context("entity-contexts")
|
||||
|
||||
triples_producer.send.assert_called_once()
|
||||
entity_contexts_producer.send.assert_called_once()
|
||||
triples_producer.send.assert_not_called()
|
||||
entity_contexts_producer.send.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_invalid_extraction_format_handling(self, definitions_processor, mock_flow_context, sample_chunk):
|
||||
|
|
|
|||
|
|
@ -73,6 +73,7 @@ class Processor(FlowProcessor):
|
|||
)
|
||||
)
|
||||
|
||||
if entities:
|
||||
r = GraphEmbeddings(
|
||||
metadata=v.metadata,
|
||||
entities=entities,
|
||||
|
|
|
|||
|
|
@ -168,6 +168,7 @@ class Processor(FlowProcessor):
|
|||
|
||||
entities.append(ec)
|
||||
|
||||
if triples:
|
||||
await self.emit_triples(
|
||||
flow("triples"),
|
||||
Metadata(
|
||||
|
|
@ -179,6 +180,7 @@ class Processor(FlowProcessor):
|
|||
triples
|
||||
)
|
||||
|
||||
if entities:
|
||||
await self.emit_ecs(
|
||||
flow("entity-contexts"),
|
||||
Metadata(
|
||||
|
|
|
|||
|
|
@ -282,17 +282,6 @@ class Processor(FlowProcessor):
|
|||
|
||||
if not ontology_subsets:
|
||||
logger.warning("No relevant ontology elements found for chunk")
|
||||
# Emit empty outputs
|
||||
await self.emit_triples(
|
||||
flow("triples"),
|
||||
v.metadata,
|
||||
[]
|
||||
)
|
||||
await self.emit_entity_contexts(
|
||||
flow("entity-contexts"),
|
||||
v.metadata,
|
||||
[]
|
||||
)
|
||||
return
|
||||
|
||||
# Merge subsets if multiple ontologies matched
|
||||
|
|
@ -327,6 +316,7 @@ class Processor(FlowProcessor):
|
|||
entity_contexts = self.build_entity_contexts(all_triples)
|
||||
|
||||
# Emit all triples (extracted + ontology definitions)
|
||||
if all_triples:
|
||||
await self.emit_triples(
|
||||
flow("triples"),
|
||||
v.metadata,
|
||||
|
|
@ -334,6 +324,7 @@ class Processor(FlowProcessor):
|
|||
)
|
||||
|
||||
# Emit entity contexts
|
||||
if entity_contexts:
|
||||
await self.emit_entity_contexts(
|
||||
flow("entity-contexts"),
|
||||
v.metadata,
|
||||
|
|
@ -345,17 +336,6 @@ class Processor(FlowProcessor):
|
|||
|
||||
except Exception as e:
|
||||
logger.error(f"OntoRAG extraction exception: {e}", exc_info=True)
|
||||
# Emit empty outputs on error
|
||||
await self.emit_triples(
|
||||
flow("triples"),
|
||||
v.metadata,
|
||||
[]
|
||||
)
|
||||
await self.emit_entity_contexts(
|
||||
flow("entity-contexts"),
|
||||
v.metadata,
|
||||
[]
|
||||
)
|
||||
|
||||
async def extract_with_simplified_format(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -181,6 +181,7 @@ class Processor(FlowProcessor):
|
|||
o=Term(type=IRI, iri=v.metadata.id)
|
||||
))
|
||||
|
||||
if triples:
|
||||
await self.emit_triples(
|
||||
flow("triples"),
|
||||
Metadata(
|
||||
|
|
|
|||
|
|
@ -64,11 +64,13 @@ class Processor(FlowProcessor):
|
|||
async def on_triples(self, msg, consumer, flow):
|
||||
|
||||
v = msg.value()
|
||||
if v.triples:
|
||||
await self.table_store.add_triples(v)
|
||||
|
||||
async def on_graph_embeddings(self, msg, consumer, flow):
|
||||
|
||||
v = msg.value()
|
||||
if v.entities:
|
||||
await self.table_store.add_graph_embeddings(v)
|
||||
|
||||
@staticmethod
|
||||
|
|
|
|||
|
|
@ -435,6 +435,7 @@ class KnowledgeTableStore:
|
|||
else:
|
||||
metadata = []
|
||||
|
||||
if row[3]:
|
||||
triples = [
|
||||
Triple(
|
||||
s = tuple_to_term(elt[0], elt[1]),
|
||||
|
|
@ -443,6 +444,8 @@ class KnowledgeTableStore:
|
|||
)
|
||||
for elt in row[3]
|
||||
]
|
||||
else:
|
||||
triples = []
|
||||
|
||||
await receiver(
|
||||
Triples(
|
||||
|
|
@ -491,6 +494,7 @@ class KnowledgeTableStore:
|
|||
else:
|
||||
metadata = []
|
||||
|
||||
if row[3]:
|
||||
entities = [
|
||||
EntityEmbeddings(
|
||||
entity = tuple_to_term(ent[0][0], ent[0][1]),
|
||||
|
|
@ -498,6 +502,8 @@ class KnowledgeTableStore:
|
|||
)
|
||||
for ent in row[3]
|
||||
]
|
||||
else:
|
||||
entities = []
|
||||
|
||||
await receiver(
|
||||
GraphEmbeddings(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue