Remove schema:subjectOf edges from KG extraction (#695)

The subjectOf triples were redundant with the subgraph provenance model
introduced in e8407b34. Entity-to-source lineage can already be traced via
tg:contains -> subgraph -> prov:wasDerivedFrom -> chunk, so the direct
subjectOf edges were unnecessary metadata that polluted the knowledge graph.

This commit removes the subjectOf edges from all three extractors (agent,
definitions, relationships), cleans up the SUBJECT_OF constant and
vocabulary label, and updates the tests accordingly.
This commit is contained in:
cybermaggedon 2026-03-13 12:11:21 +00:00 committed by GitHub
parent 64e3f6bd0d
commit e6623fc915
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 9 additions and 88 deletions

View file

@ -6,7 +6,7 @@ import logging
from ....schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
from ....schema import EntityContext, EntityContexts
from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, SUBJECT_OF, DEFINITION
from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, DEFINITION
from ....base import FlowProcessor, ConsumerSpec, ProducerSpec
from ....base import AgentClientSpec
@ -269,14 +269,6 @@ class Processor(FlowProcessor):
triples.append(definition_triple)
extracted_triples.append(definition_triple)
# Add subject-of relationship to document
if metadata.id:
triples.append(Triple(
s = Term(type=IRI, iri=entity_uri),
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
# Create entity context for embeddings
entity_contexts.append(EntityContext(
entity=Term(type=IRI, iri=entity_uri),
@ -327,27 +319,6 @@ class Processor(FlowProcessor):
triples.append(relationship_triple)
extracted_triples.append(relationship_triple)
# Add subject-of relationships to document
if metadata.id:
triples.append(Triple(
s = subject_value,
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
triples.append(Triple(
s = predicate_value,
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
if rel.get("object-entity", True):
triples.append(Triple(
s = object_value,
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
return triples, entity_contexts, extracted_triples
@staticmethod

View file

@ -15,7 +15,7 @@ from .... schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
logger = logging.getLogger(__name__)
from .... schema import EntityContext, EntityContexts
from .... schema import PromptRequest, PromptResponse
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base import PromptClientSpec, ParameterSpec
@ -25,8 +25,6 @@ from .... flow_version import __version__ as COMPONENT_VERSION
DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION)
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
default_ident = "kg-extract-definitions"
default_concurrency = 1
default_triples_batch_size = 50
@ -176,13 +174,6 @@ class Processor(FlowProcessor):
triples.append(definition_triple)
extracted_triples.append(definition_triple)
# Link entity to chunk (not top-level document)
triples.append(Triple(
s=s_value,
p=SUBJECT_OF_VALUE,
o=Term(type=IRI, iri=chunk_uri)
))
# Output entity name as context for direct name matching
# Include chunk_id for embedding provenance
entities.append(EntityContext(

View file

@ -15,7 +15,7 @@ logger = logging.getLogger(__name__)
from .... schema import Chunk, Triple, Triples
from .... schema import Metadata, Term, IRI, LITERAL
from .... schema import PromptRequest, PromptResponse
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base import PromptClientSpec, ParameterSpec
@ -24,7 +24,6 @@ from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph
from .... flow_version import __version__ as COMPONENT_VERSION
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
default_ident = "kg-extract-relationships"
default_concurrency = 1
@ -185,21 +184,6 @@ class Processor(FlowProcessor):
o=Term(type=LITERAL, value=str(o))
))
# Link entity to chunk (not top-level document)
triples.append(Triple(
s=s_value,
p=SUBJECT_OF_VALUE,
o=Term(type=IRI, iri=chunk_uri)
))
if rel["object-entity"]:
# Link object entity to chunk
triples.append(Triple(
s=o_value,
p=SUBJECT_OF_VALUE,
o=Term(type=IRI, iri=chunk_uri)
))
# Generate subgraph provenance once for all extracted triples
if extracted_triples:
sg_uri = subgraph_uri()