Remove schema:subjectOf edges from KG extraction (#695)

The schema:subjectOf triples are redundant with the subgraph provenance model introduced in e8407b34. Entity-to-source lineage can already be traced via tg:contains -> subgraph -> prov:wasDerivedFrom -> chunk, so the direct subjectOf edges were unnecessary metadata that polluted the knowledge graph. This commit removes them from all three extractors (agent, definitions, relationships), removes the SUBJECT_OF constant and its vocabulary label, and updates the tests accordingly.
2026-04-25 00:16:23 +02:00 · 2026-03-13 12:11:21 +00:00 · 2026-03-13 12:11:21 +00:00 · e6623fc915
commit e6623fc915
parent 64e3f6bd0d
10 changed files with 9 additions and 88 deletions
--- a/trustgraph-flow/trustgraph/extract/kg/agent/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/agent/extract.py
@@ -6,7 +6,7 @@ import logging
 from ....schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
 from ....schema import EntityContext, EntityContexts

-from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, SUBJECT_OF, DEFINITION
+from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, DEFINITION

 from ....base import FlowProcessor, ConsumerSpec, ProducerSpec
 from ....base import AgentClientSpec
@@ -269,14 +269,6 @@ class Processor(FlowProcessor):
            triples.append(definition_triple)
            extracted_triples.append(definition_triple)

-            # Add subject-of relationship to document
-            if metadata.id:
-                triples.append(Triple(
-                    s = Term(type=IRI, iri=entity_uri),
-                    p = Term(type=IRI, iri=SUBJECT_OF),
-                    o = Term(type=IRI, iri=metadata.id),
-                ))
-
            # Create entity context for embeddings
            entity_contexts.append(EntityContext(
                entity=Term(type=IRI, iri=entity_uri),
@@ -327,27 +319,6 @@ class Processor(FlowProcessor):
            triples.append(relationship_triple)
            extracted_triples.append(relationship_triple)

-            # Add subject-of relationships to document
-            if metadata.id:
-                triples.append(Triple(
-                    s = subject_value,
-                    p = Term(type=IRI, iri=SUBJECT_OF),
-                    o = Term(type=IRI, iri=metadata.id),
-                ))
-
-                triples.append(Triple(
-                    s = predicate_value,
-                    p = Term(type=IRI, iri=SUBJECT_OF),
-                    o = Term(type=IRI, iri=metadata.id),
-                ))
-
-                if rel.get("object-entity", True):
-                    triples.append(Triple(
-                        s = object_value,
-                        p = Term(type=IRI, iri=SUBJECT_OF),
-                        o = Term(type=IRI, iri=metadata.id),
-                    ))
-
        return triples, entity_contexts, extracted_triples

    @staticmethod
--- a/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py
@@ -15,7 +15,7 @@ from .... schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
 logger = logging.getLogger(__name__)
 from .... schema import EntityContext, EntityContexts
 from .... schema import PromptRequest, PromptResponse
-from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
+from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL

 from .... base import FlowProcessor, ConsumerSpec,  ProducerSpec
 from .... base import PromptClientSpec, ParameterSpec
@@ -25,8 +25,6 @@ from .... flow_version import __version__ as COMPONENT_VERSION

 DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION)
 RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
-SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
-
 default_ident = "kg-extract-definitions"
 default_concurrency = 1
 default_triples_batch_size = 50
@@ -176,13 +174,6 @@ class Processor(FlowProcessor):
                triples.append(definition_triple)
                extracted_triples.append(definition_triple)

-                # Link entity to chunk (not top-level document)
-                triples.append(Triple(
-                    s=s_value,
-                    p=SUBJECT_OF_VALUE,
-                    o=Term(type=IRI, iri=chunk_uri)
-                ))
-
                # Output entity name as context for direct name matching
                # Include chunk_id for embedding provenance
                entities.append(EntityContext(
--- a/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py
@@ -15,7 +15,7 @@ logger = logging.getLogger(__name__)
 from .... schema import Chunk, Triple, Triples
 from .... schema import Metadata, Term, IRI, LITERAL
 from .... schema import PromptRequest, PromptResponse
-from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
+from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES

 from .... base import FlowProcessor, ConsumerSpec,  ProducerSpec
 from .... base import PromptClientSpec, ParameterSpec
@@ -24,7 +24,6 @@ from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph
 from .... flow_version import __version__ as COMPONENT_VERSION

 RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
-SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)

 default_ident = "kg-extract-relationships"
 default_concurrency = 1
@@ -185,21 +184,6 @@ class Processor(FlowProcessor):
                        o=Term(type=LITERAL, value=str(o))
                    ))

-                # Link entity to chunk (not top-level document)
-                triples.append(Triple(
-                    s=s_value,
-                    p=SUBJECT_OF_VALUE,
-                    o=Term(type=IRI, iri=chunk_uri)
-                ))
-
-                if rel["object-entity"]:
-                    # Link object entity to chunk
-                    triples.append(Triple(
-                        s=o_value,
-                        p=SUBJECT_OF_VALUE,
-                        o=Term(type=IRI, iri=chunk_uri)
-                    ))
-
            # Generate subgraph provenance once for all extracted triples
            if extracted_triples:
                sg_uri = subgraph_uri()