Remove schema:subjectOf edges from KG extraction (#695)

The subjectOf triples were redundant with the subgraph provenance model
introduced in e8407b34. Entity-to-source lineage can be traced via
tg:contains -> subgraph -> prov:wasDerivedFrom -> chunk, making the
direct subjectOf edges unnecessary metadata polluting the knowledge graph.

Removed from all three extractors (agent, definitions, relationships),
cleaned up the SUBJECT_OF constant and vocabulary label, and updated
tests accordingly.
This commit is contained in:
cybermaggedon 2026-03-13 12:11:21 +00:00 committed by GitHub
parent 64e3f6bd0d
commit e6623fc915
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 9 additions and 88 deletions

View file

@ -14,7 +14,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
from trustgraph.schema import EntityContext, EntityContexts, AgentRequest, AgentResponse
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
from trustgraph.template.prompt_manager import PromptManager
@ -174,10 +174,6 @@ class TestAgentKgExtractionIntegration:
label_triples = [t for t in sent_triples.triples if t.p.iri == RDF_LABEL]
assert len(label_triples) >= 2 # Should have labels for entities
# Check subject-of relationships
subject_of_triples = [t for t in sent_triples.triples if t.p.iri == SUBJECT_OF]
assert len(subject_of_triples) >= 2 # Entities should be linked to document
# Verify entity contexts were emitted
entity_contexts_publisher = mock_flow_context("entity-contexts")
entity_contexts_publisher.send.assert_called_once()

View file

@ -17,7 +17,7 @@ from trustgraph.extract.kg.relationships.extract import Processor as Relationshi
from trustgraph.storage.knowledge.store import Processor as KnowledgeStoreProcessor
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings, EntityEmbeddings
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
@pytest.mark.integration

View file

@ -13,7 +13,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
from trustgraph.schema import EntityContext, EntityContexts
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
from trustgraph.template.prompt_manager import PromptManager
@ -183,12 +183,6 @@ This is not JSON at all
assert def_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
assert def_triple.o.value == "A subset of AI that enables learning from data."
# Check subject-of triple
subject_of_triple = next((t for t in triples if t.p.iri == SUBJECT_OF), None)
assert subject_of_triple is not None
assert subject_of_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
assert subject_of_triple.o.iri == "doc123"
# Check entity context
assert len(entity_contexts) == 1
assert entity_contexts[0].entity.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
@ -228,10 +222,6 @@ This is not JSON at all
assert rel_triple.o.iri == object_uri
assert rel_triple.o.type == IRI
# Check subject-of relationships
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF and t.o.iri == "doc123"]
assert len(subject_of_triples) >= 2 # At least subject and predicate should have subject-of relations
def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata):
"""Test processing of relationships with literal objects"""
data = [
@ -274,10 +264,6 @@ This is not JSON at all
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, metadata)
# Should not create subject-of relationships when no metadata ID
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
assert len(subject_of_triples) == 0
# Should still create entity contexts
assert len(entity_contexts) == 1

View file

@ -13,7 +13,7 @@ from unittest.mock import AsyncMock, MagicMock
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
from trustgraph.schema import EntityContext, EntityContexts
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
@pytest.mark.unit
@ -187,10 +187,6 @@ class TestAgentKgExtractionEdgeCases:
data = [{"type": "definition", "entity": "Test", "definition": "Test def"}]
triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata)
# Should not create subject-of triples when ID is empty string
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
assert len(subject_of_triples) == 0
def test_process_extraction_data_special_entity_names(self, agent_extractor):
"""Test processing with special characters in entity names"""
metadata = Metadata(id="doc123")
@ -338,7 +334,7 @@ class TestAgentKgExtractionEdgeCases:
# Should process all relationships
# Note: The current implementation has some logic issues that these tests document
assert len([t for t in triples if t.p.iri != RDF_LABEL and t.p.iri != SUBJECT_OF]) >= 7
assert len([t for t in triples if t.p.iri != RDF_LABEL]) >= 7
@pytest.mark.asyncio
async def test_emit_empty_collections(self, agent_extractor):

View file

@ -30,7 +30,6 @@ RDFS_LABEL = RDFS + "label"
# Schema.org namespace
SCHEMA = "https://schema.org/"
SCHEMA_SUBJECT_OF = SCHEMA + "subjectOf"
SCHEMA_DIGITAL_DOCUMENT = SCHEMA + "DigitalDocument"
SCHEMA_DESCRIPTION = SCHEMA + "description"
SCHEMA_KEYWORDS = SCHEMA + "keywords"

View file

@ -16,7 +16,7 @@ from . namespaces import (
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
SCHEMA_SUBJECT_OF, SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
SCHEMA_KEYWORDS, SCHEMA_NAME,
SKOS_DEFINITION,
TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
@ -63,7 +63,6 @@ DC_PREDICATE_LABELS = [
# Schema.org labels
SCHEMA_LABELS = [
_label_triple(SCHEMA_SUBJECT_OF, "subject of"),
_label_triple(SCHEMA_DIGITAL_DOCUMENT, "Digital Document"),
_label_triple(SCHEMA_DESCRIPTION, "description"),
_label_triple(SCHEMA_KEYWORDS, "keywords"),

View file

@ -2,7 +2,6 @@
RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
DEFINITION = "http://www.w3.org/2004/02/skos/core#definition"
SUBJECT_OF = "https://schema.org/subjectOf"
TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/"

View file

@ -6,7 +6,7 @@ import logging
from ....schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
from ....schema import EntityContext, EntityContexts
from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, SUBJECT_OF, DEFINITION
from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, DEFINITION
from ....base import FlowProcessor, ConsumerSpec, ProducerSpec
from ....base import AgentClientSpec
@ -269,14 +269,6 @@ class Processor(FlowProcessor):
triples.append(definition_triple)
extracted_triples.append(definition_triple)
# Add subject-of relationship to document
if metadata.id:
triples.append(Triple(
s = Term(type=IRI, iri=entity_uri),
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
# Create entity context for embeddings
entity_contexts.append(EntityContext(
entity=Term(type=IRI, iri=entity_uri),
@ -327,27 +319,6 @@ class Processor(FlowProcessor):
triples.append(relationship_triple)
extracted_triples.append(relationship_triple)
# Add subject-of relationships to document
if metadata.id:
triples.append(Triple(
s = subject_value,
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
triples.append(Triple(
s = predicate_value,
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
if rel.get("object-entity", True):
triples.append(Triple(
s = object_value,
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
return triples, entity_contexts, extracted_triples
@staticmethod

View file

@ -15,7 +15,7 @@ from .... schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
logger = logging.getLogger(__name__)
from .... schema import EntityContext, EntityContexts
from .... schema import PromptRequest, PromptResponse
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base import PromptClientSpec, ParameterSpec
@ -25,8 +25,6 @@ from .... flow_version import __version__ as COMPONENT_VERSION
DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION)
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
default_ident = "kg-extract-definitions"
default_concurrency = 1
default_triples_batch_size = 50
@ -176,13 +174,6 @@ class Processor(FlowProcessor):
triples.append(definition_triple)
extracted_triples.append(definition_triple)
# Link entity to chunk (not top-level document)
triples.append(Triple(
s=s_value,
p=SUBJECT_OF_VALUE,
o=Term(type=IRI, iri=chunk_uri)
))
# Output entity name as context for direct name matching
# Include chunk_id for embedding provenance
entities.append(EntityContext(

View file

@ -15,7 +15,7 @@ logger = logging.getLogger(__name__)
from .... schema import Chunk, Triple, Triples
from .... schema import Metadata, Term, IRI, LITERAL
from .... schema import PromptRequest, PromptResponse
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base import PromptClientSpec, ParameterSpec
@ -24,7 +24,6 @@ from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph
from .... flow_version import __version__ as COMPONENT_VERSION
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
default_ident = "kg-extract-relationships"
default_concurrency = 1
@ -185,21 +184,6 @@ class Processor(FlowProcessor):
o=Term(type=LITERAL, value=str(o))
))
# Link entity to chunk (not top-level document)
triples.append(Triple(
s=s_value,
p=SUBJECT_OF_VALUE,
o=Term(type=IRI, iri=chunk_uri)
))
if rel["object-entity"]:
# Link object entity to chunk
triples.append(Triple(
s=o_value,
p=SUBJECT_OF_VALUE,
o=Term(type=IRI, iri=chunk_uri)
))
# Generate subgraph provenance once for all extracted triples
if extracted_triples:
sg_uri = subgraph_uri()