Remove schema:subjectOf edges from KG extraction (#695)

The subjectOf triples were redundant with the subgraph provenance model
introduced in e8407b34. Entity-to-source lineage can already be traced via
tg:contains -> subgraph -> prov:wasDerivedFrom -> chunk, so the direct
subjectOf edges were unnecessary metadata that polluted the knowledge graph.

Removed from all three extractors (agent, definitions, relationships),
cleaned up the SUBJECT_OF constant and vocabulary label, and updated
tests accordingly.
This commit is contained in:
cybermaggedon 2026-03-13 12:11:21 +00:00 committed by GitHub
parent 64e3f6bd0d
commit e6623fc915
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 9 additions and 88 deletions

View file

@@ -30,7 +30,6 @@ RDFS_LABEL = RDFS + "label"
# Schema.org namespace
SCHEMA = "https://schema.org/"
-SCHEMA_SUBJECT_OF = SCHEMA + "subjectOf"
SCHEMA_DIGITAL_DOCUMENT = SCHEMA + "DigitalDocument"
SCHEMA_DESCRIPTION = SCHEMA + "description"
SCHEMA_KEYWORDS = SCHEMA + "keywords"

View file

@@ -16,7 +16,7 @@ from . namespaces import (
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
-SCHEMA_SUBJECT_OF, SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
+SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
SCHEMA_KEYWORDS, SCHEMA_NAME,
SKOS_DEFINITION,
TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
@@ -63,7 +63,6 @@ DC_PREDICATE_LABELS = [
# Schema.org labels
SCHEMA_LABELS = [
-_label_triple(SCHEMA_SUBJECT_OF, "subject of"),
_label_triple(SCHEMA_DIGITAL_DOCUMENT, "Digital Document"),
_label_triple(SCHEMA_DESCRIPTION, "description"),
_label_triple(SCHEMA_KEYWORDS, "keywords"),

View file

@@ -2,7 +2,6 @@
RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
DEFINITION = "http://www.w3.org/2004/02/skos/core#definition"
-SUBJECT_OF = "https://schema.org/subjectOf"
TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/"