Remove schema:subjectOf edges from KG extraction (#695)

The schema:subjectOf triples were redundant with the subgraph provenance model introduced in e8407b34. Entity-to-source lineage can already be traced via tg:contains -> subgraph -> prov:wasDerivedFrom -> chunk, so the direct subjectOf edges were unnecessary metadata that polluted the knowledge graph. This change removes them from all three extractors (agent, definitions, relationships), removes the SUBJECT_OF / SCHEMA_SUBJECT_OF constants and the "subject of" vocabulary label, and updates the tests accordingly.
2026-04-27 17:36:23 +02:00 · 2026-03-13 12:11:21 +00:00 · 2026-03-13 12:11:21 +00:00 · e6623fc915
commit e6623fc915
parent 64e3f6bd0d
10 changed files with 9 additions and 88 deletions
--- a/tests/integration/test_agent_kg_extraction_integration.py
+++ b/tests/integration/test_agent_kg_extraction_integration.py
@ -14,7 +14,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
 from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
 from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
 from trustgraph.schema import EntityContext, EntityContexts, AgentRequest, AgentResponse
-from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
+from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
 from trustgraph.template.prompt_manager import PromptManager
@ -174,10 +174,6 @@ class TestAgentKgExtractionIntegration:
        label_triples = [t for t in sent_triples.triples if t.p.iri == RDF_LABEL]
        assert len(label_triples) >= 2  # Should have labels for entities
        # Check subject-of relationships
        subject_of_triples = [t for t in sent_triples.triples if t.p.iri == SUBJECT_OF]
        assert len(subject_of_triples) >= 2  # Entities should be linked to document
        # Verify entity contexts were emitted
        entity_contexts_publisher = mock_flow_context("entity-contexts")
        entity_contexts_publisher.send.assert_called_once()
--- a/tests/integration/test_kg_extract_store_integration.py
+++ b/tests/integration/test_kg_extract_store_integration.py
@ -17,7 +17,7 @@ from trustgraph.extract.kg.relationships.extract import Processor as Relationshi
 from trustgraph.storage.knowledge.store import Processor as KnowledgeStoreProcessor
 from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
 from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings, EntityEmbeddings
-from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
+from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
@pytest.mark.integration
--- a/tests/unit/test_knowledge_graph/test_agent_extraction.py
+++ b/tests/unit/test_knowledge_graph/test_agent_extraction.py
@ -13,7 +13,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
 from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
 from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
 from trustgraph.schema import EntityContext, EntityContexts
-from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
+from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
 from trustgraph.template.prompt_manager import PromptManager
@ -183,12 +183,6 @@ This is not JSON at all
        assert def_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
        assert def_triple.o.value == "A subset of AI that enables learning from data."
        # Check subject-of triple
        subject_of_triple = next((t for t in triples if t.p.iri == SUBJECT_OF), None)
        assert subject_of_triple is not None
        assert subject_of_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
        assert subject_of_triple.o.iri == "doc123"
        # Check entity context
        assert len(entity_contexts) == 1
        assert entity_contexts[0].entity.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
@ -228,10 +222,6 @@ This is not JSON at all
        assert rel_triple.o.iri == object_uri
        assert rel_triple.o.type == IRI
        # Check subject-of relationships
        subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF and t.o.iri == "doc123"]
        assert len(subject_of_triples) >= 2  # At least subject and predicate should have subject-of relations
    def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata):
        """Test processing of relationships with literal objects"""
        data = [
@ -274,10 +264,6 @@ This is not JSON at all
        triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, metadata)
        # Should not create subject-of relationships when no metadata ID
        subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
        assert len(subject_of_triples) == 0
        # Should still create entity contexts
        assert len(entity_contexts) == 1
--- a/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py
+++ b/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py
@ -13,7 +13,7 @@ from unittest.mock import AsyncMock, MagicMock
 from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
 from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
 from trustgraph.schema import EntityContext, EntityContexts
-from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
+from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
@pytest.mark.unit
@ -187,10 +187,6 @@ class TestAgentKgExtractionEdgeCases:
        data = [{"type": "definition", "entity": "Test", "definition": "Test def"}]
        triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata)
        # Should not create subject-of triples when ID is empty string
        subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
        assert len(subject_of_triples) == 0
    def test_process_extraction_data_special_entity_names(self, agent_extractor):
        """Test processing with special characters in entity names"""
        metadata = Metadata(id="doc123")
@ -338,7 +334,7 @@ class TestAgentKgExtractionEdgeCases:
        # Should process all relationships
        # Note: The current implementation has some logic issues that these tests document
-        assert len([t for t in triples if t.p.iri != RDF_LABEL and t.p.iri != SUBJECT_OF]) >= 7
+        assert len([t for t in triples if t.p.iri != RDF_LABEL]) >= 7
    @pytest.mark.asyncio
    async def test_emit_empty_collections(self, agent_extractor):
--- a/trustgraph-base/trustgraph/provenance/namespaces.py
+++ b/trustgraph-base/trustgraph/provenance/namespaces.py
@ -30,7 +30,6 @@ RDFS_LABEL = RDFS + "label"
 # Schema.org namespace
 SCHEMA = "https://schema.org/"
 SCHEMA_SUBJECT_OF = SCHEMA + "subjectOf"
 SCHEMA_DIGITAL_DOCUMENT = SCHEMA + "DigitalDocument"
 SCHEMA_DESCRIPTION = SCHEMA + "description"
 SCHEMA_KEYWORDS = SCHEMA + "keywords"
--- a/trustgraph-base/trustgraph/provenance/vocabulary.py
+++ b/trustgraph-base/trustgraph/provenance/vocabulary.py
@ -16,7 +16,7 @@ from . namespaces import (
    PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
    PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
    DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
-    SCHEMA_SUBJECT_OF, SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
+    SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
    SCHEMA_KEYWORDS, SCHEMA_NAME,
    SKOS_DEFINITION,
    TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
@ -63,7 +63,6 @@ DC_PREDICATE_LABELS = [
 # Schema.org labels
 SCHEMA_LABELS = [
    _label_triple(SCHEMA_SUBJECT_OF, "subject of"),
    _label_triple(SCHEMA_DIGITAL_DOCUMENT, "Digital Document"),
    _label_triple(SCHEMA_DESCRIPTION, "description"),
    _label_triple(SCHEMA_KEYWORDS, "keywords"),
--- a/trustgraph-base/trustgraph/rdf.py
+++ b/trustgraph-base/trustgraph/rdf.py
@ -2,7 +2,6 @@
 RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
 RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
 DEFINITION = "http://www.w3.org/2004/02/skos/core#definition"
 SUBJECT_OF = "https://schema.org/subjectOf"
 TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/"
--- a/trustgraph-flow/trustgraph/extract/kg/agent/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/agent/extract.py
@ -6,7 +6,7 @@ import logging
 from ....schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
 from ....schema import EntityContext, EntityContexts
-from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, SUBJECT_OF, DEFINITION
+from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, DEFINITION
 from ....base import FlowProcessor, ConsumerSpec, ProducerSpec
 from ....base import AgentClientSpec
@ -269,14 +269,6 @@ class Processor(FlowProcessor):
            triples.append(definition_triple)
            extracted_triples.append(definition_triple)
            # Add subject-of relationship to document
            if metadata.id:
                triples.append(Triple(
                    s = Term(type=IRI, iri=entity_uri),
                    p = Term(type=IRI, iri=SUBJECT_OF),
                    o = Term(type=IRI, iri=metadata.id),
                ))
            # Create entity context for embeddings
            entity_contexts.append(EntityContext(
                entity=Term(type=IRI, iri=entity_uri),
@ -327,27 +319,6 @@ class Processor(FlowProcessor):
            triples.append(relationship_triple)
            extracted_triples.append(relationship_triple)
            # Add subject-of relationships to document
            if metadata.id:
                triples.append(Triple(
                    s = subject_value,
                    p = Term(type=IRI, iri=SUBJECT_OF),
                    o = Term(type=IRI, iri=metadata.id),
                ))
                triples.append(Triple(
                    s = predicate_value,
                    p = Term(type=IRI, iri=SUBJECT_OF),
                    o = Term(type=IRI, iri=metadata.id),
                ))
                if rel.get("object-entity", True):
                    triples.append(Triple(
                        s = object_value,
                        p = Term(type=IRI, iri=SUBJECT_OF),
                        o = Term(type=IRI, iri=metadata.id),
                    ))
        return triples, entity_contexts, extracted_triples
    @staticmethod
--- a/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py
@ -15,7 +15,7 @@ from .... schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
 logger = logging.getLogger(__name__)
 from .... schema import EntityContext, EntityContexts
 from .... schema import PromptRequest, PromptResponse
-from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
+from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
 from .... base import FlowProcessor, ConsumerSpec,  ProducerSpec
 from .... base import PromptClientSpec, ParameterSpec
@ -25,8 +25,6 @@ from .... flow_version import __version__ as COMPONENT_VERSION
 DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION)
 RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
 SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
 default_ident = "kg-extract-definitions"
 default_concurrency = 1
 default_triples_batch_size = 50
@ -176,13 +174,6 @@ class Processor(FlowProcessor):
                triples.append(definition_triple)
                extracted_triples.append(definition_triple)
                # Link entity to chunk (not top-level document)
                triples.append(Triple(
                    s=s_value,
                    p=SUBJECT_OF_VALUE,
                    o=Term(type=IRI, iri=chunk_uri)
                ))
                # Output entity name as context for direct name matching
                # Include chunk_id for embedding provenance
                entities.append(EntityContext(
--- a/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py
@ -15,7 +15,7 @@ logger = logging.getLogger(__name__)
 from .... schema import Chunk, Triple, Triples
 from .... schema import Metadata, Term, IRI, LITERAL
 from .... schema import PromptRequest, PromptResponse
-from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
+from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES
 from .... base import FlowProcessor, ConsumerSpec,  ProducerSpec
 from .... base import PromptClientSpec, ParameterSpec
@ -24,7 +24,6 @@ from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph
 from .... flow_version import __version__ as COMPONENT_VERSION
 RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
 SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
 default_ident = "kg-extract-relationships"
 default_concurrency = 1
@ -185,21 +184,6 @@ class Processor(FlowProcessor):
                        o=Term(type=LITERAL, value=str(o))
                    ))
                # Link entity to chunk (not top-level document)
                triples.append(Triple(
                    s=s_value,
                    p=SUBJECT_OF_VALUE,
                    o=Term(type=IRI, iri=chunk_uri)
                ))
                if rel["object-entity"]:
                    # Link object entity to chunk
                    triples.append(Triple(
                        s=o_value,
                        p=SUBJECT_OF_VALUE,
                        o=Term(type=IRI, iri=chunk_uri)
                    ))
            # Generate subgraph provenance once for all extracted triples
            if extracted_triples:
                sg_uri = subgraph_uri()