Remove schema:subjectOf edges from KG extraction (#695)

The schema:subjectOf triples were redundant with the subgraph provenance
model introduced in e8407b34. Entity-to-source lineage can already be traced
via tg:contains -> subgraph -> prov:wasDerivedFrom -> chunk, so the direct
subjectOf edges were unnecessary metadata that polluted the knowledge graph.

This change removes the subjectOf edges from all three extractors (agent,
definitions, relationships), deletes the SUBJECT_OF constant and its
vocabulary label, and updates the tests accordingly.
This commit is contained in:
cybermaggedon 2026-03-13 12:11:21 +00:00 committed by GitHub
parent 64e3f6bd0d
commit e6623fc915
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 9 additions and 88 deletions

View file

@ -14,7 +14,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
from trustgraph.schema import EntityContext, EntityContexts, AgentRequest, AgentResponse from trustgraph.schema import EntityContext, EntityContexts, AgentRequest, AgentResponse
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
from trustgraph.template.prompt_manager import PromptManager from trustgraph.template.prompt_manager import PromptManager
@ -174,10 +174,6 @@ class TestAgentKgExtractionIntegration:
label_triples = [t for t in sent_triples.triples if t.p.iri == RDF_LABEL] label_triples = [t for t in sent_triples.triples if t.p.iri == RDF_LABEL]
assert len(label_triples) >= 2 # Should have labels for entities assert len(label_triples) >= 2 # Should have labels for entities
# Check subject-of relationships
subject_of_triples = [t for t in sent_triples.triples if t.p.iri == SUBJECT_OF]
assert len(subject_of_triples) >= 2 # Entities should be linked to document
# Verify entity contexts were emitted # Verify entity contexts were emitted
entity_contexts_publisher = mock_flow_context("entity-contexts") entity_contexts_publisher = mock_flow_context("entity-contexts")
entity_contexts_publisher.send.assert_called_once() entity_contexts_publisher.send.assert_called_once()

View file

@ -17,7 +17,7 @@ from trustgraph.extract.kg.relationships.extract import Processor as Relationshi
from trustgraph.storage.knowledge.store import Processor as KnowledgeStoreProcessor from trustgraph.storage.knowledge.store import Processor as KnowledgeStoreProcessor
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings, EntityEmbeddings from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings, EntityEmbeddings
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
@pytest.mark.integration @pytest.mark.integration

View file

@ -13,7 +13,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
from trustgraph.schema import EntityContext, EntityContexts from trustgraph.schema import EntityContext, EntityContexts
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
from trustgraph.template.prompt_manager import PromptManager from trustgraph.template.prompt_manager import PromptManager
@ -183,12 +183,6 @@ This is not JSON at all
assert def_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" assert def_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
assert def_triple.o.value == "A subset of AI that enables learning from data." assert def_triple.o.value == "A subset of AI that enables learning from data."
# Check subject-of triple
subject_of_triple = next((t for t in triples if t.p.iri == SUBJECT_OF), None)
assert subject_of_triple is not None
assert subject_of_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
assert subject_of_triple.o.iri == "doc123"
# Check entity context # Check entity context
assert len(entity_contexts) == 1 assert len(entity_contexts) == 1
assert entity_contexts[0].entity.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" assert entity_contexts[0].entity.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
@ -228,10 +222,6 @@ This is not JSON at all
assert rel_triple.o.iri == object_uri assert rel_triple.o.iri == object_uri
assert rel_triple.o.type == IRI assert rel_triple.o.type == IRI
# Check subject-of relationships
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF and t.o.iri == "doc123"]
assert len(subject_of_triples) >= 2 # At least subject and predicate should have subject-of relations
def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata): def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata):
"""Test processing of relationships with literal objects""" """Test processing of relationships with literal objects"""
data = [ data = [
@ -274,10 +264,6 @@ This is not JSON at all
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, metadata) triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, metadata)
# Should not create subject-of relationships when no metadata ID
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
assert len(subject_of_triples) == 0
# Should still create entity contexts # Should still create entity contexts
assert len(entity_contexts) == 1 assert len(entity_contexts) == 1

View file

@ -13,7 +13,7 @@ from unittest.mock import AsyncMock, MagicMock
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
from trustgraph.schema import EntityContext, EntityContexts from trustgraph.schema import EntityContext, EntityContexts
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
@pytest.mark.unit @pytest.mark.unit
@ -187,10 +187,6 @@ class TestAgentKgExtractionEdgeCases:
data = [{"type": "definition", "entity": "Test", "definition": "Test def"}] data = [{"type": "definition", "entity": "Test", "definition": "Test def"}]
triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata)
# Should not create subject-of triples when ID is empty string
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
assert len(subject_of_triples) == 0
def test_process_extraction_data_special_entity_names(self, agent_extractor): def test_process_extraction_data_special_entity_names(self, agent_extractor):
"""Test processing with special characters in entity names""" """Test processing with special characters in entity names"""
metadata = Metadata(id="doc123") metadata = Metadata(id="doc123")
@ -338,7 +334,7 @@ class TestAgentKgExtractionEdgeCases:
# Should process all relationships # Should process all relationships
# Note: The current implementation has some logic issues that these tests document # Note: The current implementation has some logic issues that these tests document
assert len([t for t in triples if t.p.iri != RDF_LABEL and t.p.iri != SUBJECT_OF]) >= 7 assert len([t for t in triples if t.p.iri != RDF_LABEL]) >= 7
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_emit_empty_collections(self, agent_extractor): async def test_emit_empty_collections(self, agent_extractor):

View file

@ -30,7 +30,6 @@ RDFS_LABEL = RDFS + "label"
# Schema.org namespace # Schema.org namespace
SCHEMA = "https://schema.org/" SCHEMA = "https://schema.org/"
SCHEMA_SUBJECT_OF = SCHEMA + "subjectOf"
SCHEMA_DIGITAL_DOCUMENT = SCHEMA + "DigitalDocument" SCHEMA_DIGITAL_DOCUMENT = SCHEMA + "DigitalDocument"
SCHEMA_DESCRIPTION = SCHEMA + "description" SCHEMA_DESCRIPTION = SCHEMA + "description"
SCHEMA_KEYWORDS = SCHEMA + "keywords" SCHEMA_KEYWORDS = SCHEMA + "keywords"

View file

@ -16,7 +16,7 @@ from . namespaces import (
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY, PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME, PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR, DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
SCHEMA_SUBJECT_OF, SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION, SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
SCHEMA_KEYWORDS, SCHEMA_NAME, SCHEMA_KEYWORDS, SCHEMA_NAME,
SKOS_DEFINITION, SKOS_DEFINITION,
TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER, TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
@ -63,7 +63,6 @@ DC_PREDICATE_LABELS = [
# Schema.org labels # Schema.org labels
SCHEMA_LABELS = [ SCHEMA_LABELS = [
_label_triple(SCHEMA_SUBJECT_OF, "subject of"),
_label_triple(SCHEMA_DIGITAL_DOCUMENT, "Digital Document"), _label_triple(SCHEMA_DIGITAL_DOCUMENT, "Digital Document"),
_label_triple(SCHEMA_DESCRIPTION, "description"), _label_triple(SCHEMA_DESCRIPTION, "description"),
_label_triple(SCHEMA_KEYWORDS, "keywords"), _label_triple(SCHEMA_KEYWORDS, "keywords"),

View file

@ -2,7 +2,6 @@
RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label" RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
DEFINITION = "http://www.w3.org/2004/02/skos/core#definition" DEFINITION = "http://www.w3.org/2004/02/skos/core#definition"
SUBJECT_OF = "https://schema.org/subjectOf"
TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/" TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/"

View file

@ -6,7 +6,7 @@ import logging
from ....schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL from ....schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
from ....schema import EntityContext, EntityContexts from ....schema import EntityContext, EntityContexts
from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, SUBJECT_OF, DEFINITION from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, DEFINITION
from ....base import FlowProcessor, ConsumerSpec, ProducerSpec from ....base import FlowProcessor, ConsumerSpec, ProducerSpec
from ....base import AgentClientSpec from ....base import AgentClientSpec
@ -269,14 +269,6 @@ class Processor(FlowProcessor):
triples.append(definition_triple) triples.append(definition_triple)
extracted_triples.append(definition_triple) extracted_triples.append(definition_triple)
# Add subject-of relationship to document
if metadata.id:
triples.append(Triple(
s = Term(type=IRI, iri=entity_uri),
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
# Create entity context for embeddings # Create entity context for embeddings
entity_contexts.append(EntityContext( entity_contexts.append(EntityContext(
entity=Term(type=IRI, iri=entity_uri), entity=Term(type=IRI, iri=entity_uri),
@ -327,27 +319,6 @@ class Processor(FlowProcessor):
triples.append(relationship_triple) triples.append(relationship_triple)
extracted_triples.append(relationship_triple) extracted_triples.append(relationship_triple)
# Add subject-of relationships to document
if metadata.id:
triples.append(Triple(
s = subject_value,
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
triples.append(Triple(
s = predicate_value,
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
if rel.get("object-entity", True):
triples.append(Triple(
s = object_value,
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
return triples, entity_contexts, extracted_triples return triples, entity_contexts, extracted_triples
@staticmethod @staticmethod

View file

@ -15,7 +15,7 @@ from .... schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from .... schema import EntityContext, EntityContexts from .... schema import EntityContext, EntityContexts
from .... schema import PromptRequest, PromptResponse from .... schema import PromptRequest, PromptResponse
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base import PromptClientSpec, ParameterSpec from .... base import PromptClientSpec, ParameterSpec
@ -25,8 +25,6 @@ from .... flow_version import __version__ as COMPONENT_VERSION
DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION) DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION)
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL) RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
default_ident = "kg-extract-definitions" default_ident = "kg-extract-definitions"
default_concurrency = 1 default_concurrency = 1
default_triples_batch_size = 50 default_triples_batch_size = 50
@ -176,13 +174,6 @@ class Processor(FlowProcessor):
triples.append(definition_triple) triples.append(definition_triple)
extracted_triples.append(definition_triple) extracted_triples.append(definition_triple)
# Link entity to chunk (not top-level document)
triples.append(Triple(
s=s_value,
p=SUBJECT_OF_VALUE,
o=Term(type=IRI, iri=chunk_uri)
))
# Output entity name as context for direct name matching # Output entity name as context for direct name matching
# Include chunk_id for embedding provenance # Include chunk_id for embedding provenance
entities.append(EntityContext( entities.append(EntityContext(

View file

@ -15,7 +15,7 @@ logger = logging.getLogger(__name__)
from .... schema import Chunk, Triple, Triples from .... schema import Chunk, Triple, Triples
from .... schema import Metadata, Term, IRI, LITERAL from .... schema import Metadata, Term, IRI, LITERAL
from .... schema import PromptRequest, PromptResponse from .... schema import PromptRequest, PromptResponse
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base import PromptClientSpec, ParameterSpec from .... base import PromptClientSpec, ParameterSpec
@ -24,7 +24,6 @@ from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph
from .... flow_version import __version__ as COMPONENT_VERSION from .... flow_version import __version__ as COMPONENT_VERSION
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL) RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF)
default_ident = "kg-extract-relationships" default_ident = "kg-extract-relationships"
default_concurrency = 1 default_concurrency = 1
@ -185,21 +184,6 @@ class Processor(FlowProcessor):
o=Term(type=LITERAL, value=str(o)) o=Term(type=LITERAL, value=str(o))
)) ))
# Link entity to chunk (not top-level document)
triples.append(Triple(
s=s_value,
p=SUBJECT_OF_VALUE,
o=Term(type=IRI, iri=chunk_uri)
))
if rel["object-entity"]:
# Link object entity to chunk
triples.append(Triple(
s=o_value,
p=SUBJECT_OF_VALUE,
o=Term(type=IRI, iri=chunk_uri)
))
# Generate subgraph provenance once for all extracted triples # Generate subgraph provenance once for all extracted triples
if extracted_triples: if extracted_triples:
sg_uri = subgraph_uri() sg_uri = subgraph_uri()