mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-06-03 03:45:13 +02:00
Remove schema:subjectOf edges from KG extraction (#695)
The subjectOf triples were redundant with the subgraph provenance model
introduced in e8407b34. Entity-to-source lineage can be traced via
tg:contains -> subgraph -> prov:wasDerivedFrom -> chunk, making the
direct subjectOf edges unnecessary metadata polluting the knowledge graph.
Removed from all three extractors (agent, definitions, relationships),
cleaned up the SUBJECT_OF constant and vocabulary label, and updated
tests accordingly.
This commit is contained in:
parent
64e3f6bd0d
commit
e6623fc915
10 changed files with 9 additions and 88 deletions
|
|
@ -14,7 +14,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
|
|||
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
|
||||
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
|
||||
from trustgraph.schema import EntityContext, EntityContexts, AgentRequest, AgentResponse
|
||||
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
||||
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
|
||||
from trustgraph.template.prompt_manager import PromptManager
|
||||
|
||||
|
||||
|
|
@ -174,10 +174,6 @@ class TestAgentKgExtractionIntegration:
|
|||
label_triples = [t for t in sent_triples.triples if t.p.iri == RDF_LABEL]
|
||||
assert len(label_triples) >= 2 # Should have labels for entities
|
||||
|
||||
# Check subject-of relationships
|
||||
subject_of_triples = [t for t in sent_triples.triples if t.p.iri == SUBJECT_OF]
|
||||
assert len(subject_of_triples) >= 2 # Entities should be linked to document
|
||||
|
||||
# Verify entity contexts were emitted
|
||||
entity_contexts_publisher = mock_flow_context("entity-contexts")
|
||||
entity_contexts_publisher.send.assert_called_once()
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ from trustgraph.extract.kg.relationships.extract import Processor as Relationshi
|
|||
from trustgraph.storage.knowledge.store import Processor as KnowledgeStoreProcessor
|
||||
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
|
||||
from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings, EntityEmbeddings
|
||||
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
||||
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
|
|||
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
|
||||
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
|
||||
from trustgraph.schema import EntityContext, EntityContexts
|
||||
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
||||
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
|
||||
from trustgraph.template.prompt_manager import PromptManager
|
||||
|
||||
|
||||
|
|
@ -183,12 +183,6 @@ This is not JSON at all
|
|||
assert def_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
assert def_triple.o.value == "A subset of AI that enables learning from data."
|
||||
|
||||
# Check subject-of triple
|
||||
subject_of_triple = next((t for t in triples if t.p.iri == SUBJECT_OF), None)
|
||||
assert subject_of_triple is not None
|
||||
assert subject_of_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
assert subject_of_triple.o.iri == "doc123"
|
||||
|
||||
# Check entity context
|
||||
assert len(entity_contexts) == 1
|
||||
assert entity_contexts[0].entity.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
|
|
@ -228,10 +222,6 @@ This is not JSON at all
|
|||
assert rel_triple.o.iri == object_uri
|
||||
assert rel_triple.o.type == IRI
|
||||
|
||||
# Check subject-of relationships
|
||||
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF and t.o.iri == "doc123"]
|
||||
assert len(subject_of_triples) >= 2 # At least subject and predicate should have subject-of relations
|
||||
|
||||
def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata):
|
||||
"""Test processing of relationships with literal objects"""
|
||||
data = [
|
||||
|
|
@ -274,10 +264,6 @@ This is not JSON at all
|
|||
|
||||
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, metadata)
|
||||
|
||||
# Should not create subject-of relationships when no metadata ID
|
||||
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
|
||||
assert len(subject_of_triples) == 0
|
||||
|
||||
# Should still create entity contexts
|
||||
assert len(entity_contexts) == 1
|
||||
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ from unittest.mock import AsyncMock, MagicMock
|
|||
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
|
||||
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
|
||||
from trustgraph.schema import EntityContext, EntityContexts
|
||||
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
||||
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
|
|
@ -187,10 +187,6 @@ class TestAgentKgExtractionEdgeCases:
|
|||
data = [{"type": "definition", "entity": "Test", "definition": "Test def"}]
|
||||
triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata)
|
||||
|
||||
# Should not create subject-of triples when ID is empty string
|
||||
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
|
||||
assert len(subject_of_triples) == 0
|
||||
|
||||
def test_process_extraction_data_special_entity_names(self, agent_extractor):
|
||||
"""Test processing with special characters in entity names"""
|
||||
metadata = Metadata(id="doc123")
|
||||
|
|
@ -338,7 +334,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
|
||||
# Should process all relationships
|
||||
# Note: The current implementation has some logic issues that these tests document
|
||||
assert len([t for t in triples if t.p.iri != RDF_LABEL and t.p.iri != SUBJECT_OF]) >= 7
|
||||
assert len([t for t in triples if t.p.iri != RDF_LABEL]) >= 7
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_emit_empty_collections(self, agent_extractor):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue