From e6623fc9153e5af420ef480f88c2ccdc8e6ab4ff Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 13 Mar 2026 12:11:21 +0000 Subject: [PATCH] Remove schema:subjectOf edges from KG extraction (#695) The subjectOf triples were redundant with the subgraph provenance model introduced in e8407b34. Entity-to-source lineage can be traced via tg:contains -> subgraph -> prov:wasDerivedFrom -> chunk, making the direct subjectOf edges unnecessary metadata polluting the knowledge graph. Removed from all three extractors (agent, definitions, relationships), cleaned up the SUBJECT_OF constant and vocabulary label, and updated tests accordingly. --- .../test_agent_kg_extraction_integration.py | 6 +--- .../test_kg_extract_store_integration.py | 2 +- .../test_agent_extraction.py | 16 +--------- .../test_agent_extraction_edge_cases.py | 8 ++--- .../trustgraph/provenance/namespaces.py | 1 - .../trustgraph/provenance/vocabulary.py | 3 +- trustgraph-base/trustgraph/rdf.py | 1 - .../trustgraph/extract/kg/agent/extract.py | 31 +------------------ .../extract/kg/definitions/extract.py | 11 +------ .../extract/kg/relationships/extract.py | 18 +---------- 10 files changed, 9 insertions(+), 88 deletions(-) diff --git a/tests/integration/test_agent_kg_extraction_integration.py b/tests/integration/test_agent_kg_extraction_integration.py index a2576274..579498db 100644 --- a/tests/integration/test_agent_kg_extraction_integration.py +++ b/tests/integration/test_agent_kg_extraction_integration.py @@ -14,7 +14,7 @@ from unittest.mock import AsyncMock, MagicMock, patch from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL from trustgraph.schema import EntityContext, EntityContexts, AgentRequest, AgentResponse -from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF +from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL from trustgraph.template.prompt_manager import PromptManager @@ -174,10 +174,6 @@ class TestAgentKgExtractionIntegration: label_triples = [t for t in sent_triples.triples if t.p.iri == RDF_LABEL] assert len(label_triples) >= 2 # Should have labels for entities - # Check subject-of relationships - subject_of_triples = [t for t in sent_triples.triples if t.p.iri == SUBJECT_OF] - assert len(subject_of_triples) >= 2 # Entities should be linked to document - # Verify entity contexts were emitted entity_contexts_publisher = mock_flow_context("entity-contexts") entity_contexts_publisher.send.assert_called_once() diff --git a/tests/integration/test_kg_extract_store_integration.py b/tests/integration/test_kg_extract_store_integration.py index 56c30144..4d8b60ad 100644 --- a/tests/integration/test_kg_extract_store_integration.py +++ b/tests/integration/test_kg_extract_store_integration.py @@ -17,7 +17,7 @@ from trustgraph.extract.kg.relationships.extract import Processor as Relationshi from trustgraph.storage.knowledge.store import Processor as KnowledgeStoreProcessor from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings, EntityEmbeddings -from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF +from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL @pytest.mark.integration diff --git a/tests/unit/test_knowledge_graph/test_agent_extraction.py b/tests/unit/test_knowledge_graph/test_agent_extraction.py index d2824c0c..ec985e3b 100644 --- a/tests/unit/test_knowledge_graph/test_agent_extraction.py +++ b/tests/unit/test_knowledge_graph/test_agent_extraction.py @@ -13,7 +13,7 @@ from unittest.mock import AsyncMock, MagicMock, patch from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL from trustgraph.schema import EntityContext, EntityContexts -from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF +from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL from trustgraph.template.prompt_manager import PromptManager @@ -183,12 +183,6 @@ This is not JSON at all assert def_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" assert def_triple.o.value == "A subset of AI that enables learning from data." - # Check subject-of triple - subject_of_triple = next((t for t in triples if t.p.iri == SUBJECT_OF), None) - assert subject_of_triple is not None - assert subject_of_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" - assert subject_of_triple.o.iri == "doc123" - # Check entity context assert len(entity_contexts) == 1 assert entity_contexts[0].entity.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" @@ -228,10 +222,6 @@ This is not JSON at all assert rel_triple.o.iri == object_uri assert rel_triple.o.type == IRI - # Check subject-of relationships - subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF and t.o.iri == "doc123"] - assert len(subject_of_triples) >= 2 # At least subject and predicate should have subject-of relations - def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata): """Test processing of relationships with literal objects""" data = [ @@ -274,10 +264,6 @@ This is not JSON at all triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, metadata) - # Should not create subject-of relationships when no metadata ID - subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF] - assert len(subject_of_triples) == 0 - # Should still create entity contexts assert len(entity_contexts) == 1 diff --git a/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py b/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py index ac20fe11..b0be3f06 100644 --- a/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py +++ b/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py @@ -13,7 +13,7 @@ from unittest.mock import AsyncMock, MagicMock from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL from trustgraph.schema import EntityContext, EntityContexts -from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF +from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL @pytest.mark.unit @@ -187,10 +187,6 @@ class TestAgentKgExtractionEdgeCases: data = [{"type": "definition", "entity": "Test", "definition": "Test def"}] triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) - # Should not create subject-of triples when ID is empty string - subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF] - assert len(subject_of_triples) == 0 - def test_process_extraction_data_special_entity_names(self, agent_extractor): """Test processing with special characters in entity names""" metadata = Metadata(id="doc123") @@ -338,7 +334,7 @@ class TestAgentKgExtractionEdgeCases: # Should process all relationships # Note: The current implementation has some logic issues that these tests document - assert len([t for t in triples if t.p.iri != RDF_LABEL and t.p.iri != SUBJECT_OF]) >= 7 + assert len([t for t in triples if t.p.iri != RDF_LABEL]) >= 7 @pytest.mark.asyncio async def test_emit_empty_collections(self, agent_extractor): diff --git a/trustgraph-base/trustgraph/provenance/namespaces.py b/trustgraph-base/trustgraph/provenance/namespaces.py index e60dee16..4c1ab7bf 100644 --- a/trustgraph-base/trustgraph/provenance/namespaces.py +++ b/trustgraph-base/trustgraph/provenance/namespaces.py @@ -30,7 +30,6 @@ RDFS_LABEL = RDFS + "label" # Schema.org namespace SCHEMA = "https://schema.org/" -SCHEMA_SUBJECT_OF = SCHEMA + "subjectOf" SCHEMA_DIGITAL_DOCUMENT = SCHEMA + "DigitalDocument" SCHEMA_DESCRIPTION = SCHEMA + "description" SCHEMA_KEYWORDS = SCHEMA + "keywords" diff --git a/trustgraph-base/trustgraph/provenance/vocabulary.py b/trustgraph-base/trustgraph/provenance/vocabulary.py index f9ae1abd..4ad2e59b 100644 --- a/trustgraph-base/trustgraph/provenance/vocabulary.py +++ b/trustgraph-base/trustgraph/provenance/vocabulary.py @@ -16,7 +16,7 @@ from . namespaces import ( PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY, PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME, DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR, - SCHEMA_SUBJECT_OF, SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION, + SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION, SCHEMA_KEYWORDS, SCHEMA_NAME, SKOS_DEFINITION, TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER, @@ -63,7 +63,6 @@ DC_PREDICATE_LABELS = [ # Schema.org labels SCHEMA_LABELS = [ - _label_triple(SCHEMA_SUBJECT_OF, "subject of"), _label_triple(SCHEMA_DIGITAL_DOCUMENT, "Digital Document"), _label_triple(SCHEMA_DESCRIPTION, "description"), _label_triple(SCHEMA_KEYWORDS, "keywords"), diff --git a/trustgraph-base/trustgraph/rdf.py b/trustgraph-base/trustgraph/rdf.py index 32799b8d..1d3b7cba 100644 --- a/trustgraph-base/trustgraph/rdf.py +++ b/trustgraph-base/trustgraph/rdf.py @@ -2,7 +2,6 @@ RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label" DEFINITION = "http://www.w3.org/2004/02/skos/core#definition" -SUBJECT_OF = "https://schema.org/subjectOf" TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/" diff --git a/trustgraph-flow/trustgraph/extract/kg/agent/extract.py b/trustgraph-flow/trustgraph/extract/kg/agent/extract.py index 28dba11a..5ce343c6 100644 --- a/trustgraph-flow/trustgraph/extract/kg/agent/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/agent/extract.py @@ -6,7 +6,7 @@ import logging from ....schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL from ....schema import EntityContext, EntityContexts -from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, SUBJECT_OF, DEFINITION +from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, DEFINITION from ....base import FlowProcessor, ConsumerSpec, ProducerSpec from ....base import AgentClientSpec @@ -269,14 +269,6 @@ class Processor(FlowProcessor): triples.append(definition_triple) extracted_triples.append(definition_triple) - # Add subject-of relationship to document - if metadata.id: - triples.append(Triple( - s = Term(type=IRI, iri=entity_uri), - p = Term(type=IRI, iri=SUBJECT_OF), - o = Term(type=IRI, iri=metadata.id), - )) - # Create entity context for embeddings entity_contexts.append(EntityContext( entity=Term(type=IRI, iri=entity_uri), @@ -327,27 +319,6 @@ class Processor(FlowProcessor): triples.append(relationship_triple) extracted_triples.append(relationship_triple) - # Add subject-of relationships to document - if metadata.id: - triples.append(Triple( - s = subject_value, - p = Term(type=IRI, iri=SUBJECT_OF), - o = Term(type=IRI, iri=metadata.id), - )) - - triples.append(Triple( - s = predicate_value, - p = Term(type=IRI, iri=SUBJECT_OF), - o = Term(type=IRI, iri=metadata.id), - )) - - if rel.get("object-entity", True): - triples.append(Triple( - s = object_value, - p = Term(type=IRI, iri=SUBJECT_OF), - o = Term(type=IRI, iri=metadata.id), - )) - return triples, entity_contexts, extracted_triples @staticmethod diff --git a/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py b/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py index 44ea778d..2bb88c8a 100755 --- a/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py @@ -15,7 +15,7 @@ from .... schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL logger = logging.getLogger(__name__) from .... schema import EntityContext, EntityContexts from .... schema import PromptRequest, PromptResponse -from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF +from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL from .... base import FlowProcessor, ConsumerSpec, ProducerSpec from .... base import PromptClientSpec, ParameterSpec @@ -25,8 +25,6 @@ from .... flow_version import __version__ as COMPONENT_VERSION DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION) RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL) -SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF) - default_ident = "kg-extract-definitions" default_concurrency = 1 default_triples_batch_size = 50 @@ -176,13 +174,6 @@ class Processor(FlowProcessor): triples.append(definition_triple) extracted_triples.append(definition_triple) - # Link entity to chunk (not top-level document) - triples.append(Triple( - s=s_value, - p=SUBJECT_OF_VALUE, - o=Term(type=IRI, iri=chunk_uri) - )) - # Output entity name as context for direct name matching # Include chunk_id for embedding provenance entities.append(EntityContext( diff --git a/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py b/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py index 604de1df..b557ec32 100755 --- a/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py @@ -15,7 +15,7 @@ logger = logging.getLogger(__name__) from .... schema import Chunk, Triple, Triples from .... schema import Metadata, Term, IRI, LITERAL from .... schema import PromptRequest, PromptResponse -from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF +from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES from .... base import FlowProcessor, ConsumerSpec, ProducerSpec from .... base import PromptClientSpec, ParameterSpec @@ -24,7 +24,6 @@ from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph from .... flow_version import __version__ as COMPONENT_VERSION RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL) -SUBJECT_OF_VALUE = Term(type=IRI, iri=SUBJECT_OF) default_ident = "kg-extract-relationships" default_concurrency = 1 @@ -185,21 +184,6 @@ class Processor(FlowProcessor): o=Term(type=LITERAL, value=str(o)) )) - # Link entity to chunk (not top-level document) - triples.append(Triple( - s=s_value, - p=SUBJECT_OF_VALUE, - o=Term(type=IRI, iri=chunk_uri) - )) - - if rel["object-entity"]: - # Link object entity to chunk - triples.append(Triple( - s=o_value, - p=SUBJECT_OF_VALUE, - o=Term(type=IRI, iri=chunk_uri) - )) - # Generate subgraph provenance once for all extracted triples if extracted_triples: sg_uri = subgraph_uri()