From e6623fc9153e5af420ef480f88c2ccdc8e6ab4ff Mon Sep 17 00:00:00 2001
From: cybermaggedon <cybermaggedon@gmail.com>
Date: Fri, 13 Mar 2026 12:11:21 +0000
Subject: [PATCH] Remove schema:subjectOf edges from KG extraction (#695)

The subjectOf triples were redundant with the subgraph provenance model
introduced in e8407b34. Entity-to-source lineage can already be traced
via tg:contains -> subgraph -> prov:wasDerivedFrom -> chunk, so the
direct subjectOf edges were unnecessary metadata that polluted the
knowledge graph.

Removed the edges from all three extractors (agent, definitions,
relationships), deleted the SUBJECT_OF and SCHEMA_SUBJECT_OF constants
and the "subject of" vocabulary label, and updated the tests
accordingly.
---
 .../test_agent_kg_extraction_integration.py   |  6 +---
 .../test_kg_extract_store_integration.py      |  2 +-
 .../test_agent_extraction.py                  | 16 +---------
 .../test_agent_extraction_edge_cases.py       |  8 ++---
 .../trustgraph/provenance/namespaces.py       |  1 -
 .../trustgraph/provenance/vocabulary.py       |  3 +-
 trustgraph-base/trustgraph/rdf.py             |  1 -
 .../trustgraph/extract/kg/agent/extract.py    | 31 +------------------
 .../extract/kg/definitions/extract.py         | 11 +------
 .../extract/kg/relationships/extract.py       | 18 +----------
 10 files changed, 9 insertions(+), 88 deletions(-)

diff --git a/tests/integration/test_agent_kg_extraction_integration.py b/tests/integration/test_agent_kg_extraction_integration.py
index a2576274..579498db 100644
--- a/tests/integration/test_agent_kg_extraction_integration.py
+++ b/tests/integration/test_agent_kg_extraction_integration.py
@@ -14,7 +14,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
 from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
 from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
 from trustgraph.schema import EntityContext, EntityContexts, AgentRequest, AgentResponse
-from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
+from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
 from trustgraph.template.prompt_manager import PromptManager
 
 
@@ -174,10 +174,6 @@ class TestAgentKgExtractionIntegration:
         label_triples = [t for t in sent_triples.triples if t.p.iri == RDF_LABEL]
         assert len(label_triples) >= 2  # Should have labels for entities
 
-        # Check subject-of relationships
-        subject_of_triples = [t for t in sent_triples.triples if t.p.iri == SUBJECT_OF]
-        assert len(subject_of_triples) >= 2  # Entities should be linked to document
-
         # Verify entity contexts were emitted
         entity_contexts_publisher = mock_flow_context("entity-contexts")
         entity_contexts_publisher.send.assert_called_once()
diff --git a/tests/integration/test_kg_extract_store_integration.py b/tests/integration/test_kg_extract_store_integration.py
index 56c30144..4d8b60ad 100644
--- a/tests/integration/test_kg_extract_store_integration.py
+++ b/tests/integration/test_kg_extract_store_integration.py
@@ -17,7 +17,7 @@ from trustgraph.extract.kg.relationships.extract import Processor as Relationshi
 from trustgraph.storage.knowledge.store import Processor as KnowledgeStoreProcessor
 from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
 from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings, EntityEmbeddings
-from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
+from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
 
 
 @pytest.mark.integration
diff --git a/tests/unit/test_knowledge_graph/test_agent_extraction.py b/tests/unit/test_knowledge_graph/test_agent_extraction.py
index d2824c0c..ec985e3b 100644
--- a/tests/unit/test_knowledge_graph/test_agent_extraction.py
+++ b/tests/unit/test_knowledge_graph/test_agent_extraction.py
@@ -13,7 +13,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
 from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
 from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
 from trustgraph.schema import EntityContext, EntityContexts
-from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
+from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
 from trustgraph.template.prompt_manager import PromptManager
 
 
@@ -183,12 +183,6 @@ This is not JSON at all
         assert def_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
         assert def_triple.o.value == "A subset of AI that enables learning from data."
 
-        # Check subject-of triple
-        subject_of_triple = next((t for t in triples if t.p.iri == SUBJECT_OF), None)
-        assert subject_of_triple is not None
-        assert subject_of_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
-        assert subject_of_triple.o.iri == "doc123"
-
         # Check entity context
         assert len(entity_contexts) == 1
         assert entity_contexts[0].entity.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
@@ -228,10 +222,6 @@ This is not JSON at all
         assert rel_triple.o.iri == object_uri
         assert rel_triple.o.type == IRI
 
-        # Check subject-of relationships
-        subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF and t.o.iri == "doc123"]
-        assert len(subject_of_triples) >= 2  # At least subject and predicate should have subject-of relations
-
     def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata):
         """Test processing of relationships with literal objects"""
         data = [
@@ -274,10 +264,6 @@ This is not JSON at all
 
         triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, metadata)
 
-        # Should not create subject-of relationships when no metadata ID
-        subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
-        assert len(subject_of_triples) == 0
-
         # Should still create entity contexts
         assert len(entity_contexts) == 1
 
diff --git a/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py b/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py
index ac20fe11..b0be3f06 100644
--- a/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py
+++ b/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py
@@ -13,7 +13,7 @@ from unittest.mock import AsyncMock, MagicMock
 from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
 from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
 from trustgraph.schema import EntityContext, EntityContexts
-from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
+from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
 
 
 @pytest.mark.unit
@@ -187,10 +187,6 @@ class TestAgentKgExtractionEdgeCases:
         data = [{"type": "definition", "entity": "Test", "definition": "Test def"}]
         triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata)
 
-        # Should not create subject-of triples when ID is empty string
-        subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
-        assert len(subject_of_triples) == 0
-
     def test_process_extraction_data_special_entity_names(self, agent_extractor):
         """Test processing with special characters in entity names"""
         metadata = Metadata(id="doc123")
@@ -338,7 +334,7 @@ class TestAgentKgExtractionEdgeCases:
 
         # Should process all relationships
         # Note: The current implementation has some logic issues that these tests document
-        assert len([t for t in triples if t.p.iri != RDF_LABEL and t.p.iri != SUBJECT_OF]) >= 7
+        assert len([t for t in triples if t.p.iri != RDF_LABEL]) >= 7
 
     @pytest.mark.asyncio
     async def test_emit_empty_collections(self, agent_extractor):
diff --git a/trustgraph-base/trustgraph/provenance/namespaces.py b/trustgraph-base/trustgraph/provenance/namespaces.py
index e60dee16..4c1ab7bf 100644
--- a/trustgraph-base/trustgraph/provenance/namespaces.py
+++ b/trustgraph-base/trustgraph/provenance/namespaces.py
@@ -30,7 +30,6 @@ RDFS_LABEL = RDFS + "label"
 
 # Schema.org namespace
 SCHEMA = "https://schema.org/"
-SCHEMA_SUBJECT_OF = SCHEMA + "subjectOf"
 SCHEMA_DIGITAL_DOCUMENT = SCHEMA + "DigitalDocument"
 SCHEMA_DESCRIPTION = SCHEMA + "description"
 SCHEMA_KEYWORDS = SCHEMA + "keywords"
diff --git a/trustgraph-base/trustgraph/provenance/vocabulary.py b/trustgraph-base/trustgraph/provenance/vocabulary.py
index f9ae1abd..4ad2e59b 100644
--- a/trustgraph-base/trustgraph/provenance/vocabulary.py
+++ b/trustgraph-base/trustgraph/provenance/vocabulary.py
@@ -16,7 +16,7 @@ from . namespaces import (
     PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
     PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
     DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
-    SCHEMA_SUBJECT_OF, SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
+    SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
     SCHEMA_KEYWORDS, SCHEMA_NAME,
     SKOS_DEFINITION,
     TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
@@ -63,7 +63,6 @@ DC_PREDICATE_LABELS = [
 
 # Schema.org labels
 SCHEMA_LABELS = [
-    _label_triple(SCHEMA_SUBJECT_OF, "subject of"),
     _label_triple(SCHEMA_DIGITAL_DOCUMENT, "Digital Document"),
     _label_triple(SCHEMA_DESCRIPTION, "description"),
     _label_triple(SCHEMA_KEYWORDS, "keywords"),
diff --git a/trustgraph-base/trustgraph/rdf.py b/trustgraph-base/trustgraph/rdf.py
index 32799b8d..1d3b7cba 100644
--- a/trustgraph-base/trustgraph/rdf.py
+++ b/trustgraph-base/trustgraph/rdf.py
@@ -2,7 +2,6 @@
 RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
 RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
 DEFINITION = "http://www.w3.org/2004/02/skos/core#definition"
-SUBJECT_OF = "https://schema.org/subjectOf"
 
 TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/"
 
diff --git a/trustgraph-flow/trustgraph/extract/kg/agent/extract.py b/trustgraph-flow/trustgraph/extract/kg/agent/extract.py
index 28dba11a..5ce343c6 100644
--- a/trustgraph-flow/trustgraph/extract/kg/agent/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/agent/extract.py
@@ -6,7 +6,7 @@ import logging
 from ....schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
 from ....schema import EntityContext, EntityContexts
 
-from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, SUBJECT_OF, DEFINITION
+from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, DEFINITION
 
 from ....base import FlowProcessor, ConsumerSpec, ProducerSpec
 from ....base import AgentClientSpec
@@ -269,14 +269,6 @@ class Processor(FlowProcessor):
             triples.append(definition_triple)
             extracted_triples.append(definition_triple)
 
-            # Add subject-of relationship to document
-            if metadata.id:
-                triples.append(Triple(
-                    s = Term(type=IRI, iri=entity_uri),
-                    p = Term(type=IRI, iri=SUBJECT_OF),
-                    o = Term(type=IRI, iri=metadata.id),
-                ))
-
             # Create entity context for embeddings
             entity_contexts.append(EntityContext(
                 entity=Term(type=IRI, iri=entity_uri),
@@ -327,27 +319,6 @@ class Processor(FlowProcessor):
             triples.append(relationship_triple)
             extracted_triples.append(relationship_triple)
 
-            # Add subject-of relationships to document
-            if metadata.id:
-                triples.append(Triple(
-                    s = subject_value,
-                    p = Term(type=IRI, iri=SUBJECT_OF),
-                    o = Term(type=IRI, iri=metadata.id),
-                ))
-
-                triples.append(Triple(
-                    s = predicate_value,
-                    p = Term(type=IRI, iri=SUBJECT_OF),
-                    o = Term(type=IRI, iri=metadata.id),
-                ))
-
-                if rel.get("object-entity", True):
-                    triples.append(Triple(
-                        s = object_value,
-                        p = Term(type=IRI, iri=SUBJECT_OF),
-                        o = Term(type=IRI, iri=metadata.id),
-                    ))
-
         return triples, entity_contexts, extracted_triples
 
     @staticmethod
diff --git a/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py b/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py
index 44ea778d..2bb88c8a 100755
--- a/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py
@@ -15,7 +15,7 @@ from .... schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
 logger = logging.getLogger(__name__)
 from .... schema import EntityContext, EntityContexts
 from .... schema import PromptRequest, PromptResponse
-from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
+from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
 
 from .... base import FlowProcessor, ConsumerSpec,  ProducerSpec
 from .... base import PromptClientSpec, ParameterSpec
@@ -25,8 +25,6 @@ from .... flow_version import __version__ as COMPONENT_VERSION
 
 DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION)
 RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
-SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
-
 default_ident = "kg-extract-definitions"
 default_concurrency = 1
 default_triples_batch_size = 50
@@ -176,13 +174,6 @@ class Processor(FlowProcessor):
                 triples.append(definition_triple)
                 extracted_triples.append(definition_triple)
 
-                # Link entity to chunk (not top-level document)
-                triples.append(Triple(
-                    s=s_value,
-                    p=SUBJECT_OF_VALUE,
-                    o=Term(type=IRI, iri=chunk_uri)
-                ))
-
                 # Output entity name as context for direct name matching
                 # Include chunk_id for embedding provenance
                 entities.append(EntityContext(
diff --git a/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py b/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py
index 604de1df..b557ec32 100755
--- a/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py
@@ -15,7 +15,7 @@ logger = logging.getLogger(__name__)
 from .... schema import Chunk, Triple, Triples
 from .... schema import Metadata, Term, IRI, LITERAL
 from .... schema import PromptRequest, PromptResponse
-from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
+from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES
 
 from .... base import FlowProcessor, ConsumerSpec,  ProducerSpec
 from .... base import PromptClientSpec, ParameterSpec
@@ -24,7 +24,6 @@ from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph
 from .... flow_version import __version__ as COMPONENT_VERSION
 
 RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
-SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
 
 default_ident = "kg-extract-relationships"
 default_concurrency = 1
@@ -185,21 +184,6 @@ class Processor(FlowProcessor):
                         o=Term(type=LITERAL, value=str(o))
                     ))
 
-                # Link entity to chunk (not top-level document)
-                triples.append(Triple(
-                    s=s_value,
-                    p=SUBJECT_OF_VALUE,
-                    o=Term(type=IRI, iri=chunk_uri)
-                ))
-
-                if rel["object-entity"]:
-                    # Link object entity to chunk
-                    triples.append(Triple(
-                        s=o_value,
-                        p=SUBJECT_OF_VALUE,
-                        o=Term(type=IRI, iri=chunk_uri)
-                    ))
-
             # Generate subgraph provenance once for all extracted triples
             if extracted_triples:
                 sg_uri = subgraph_uri()