Subgraph provenance (#694)

Replace per-triple provenance reification with subgraph model

Extraction provenance previously created a full reification (statement
URI, activity, agent) for every single extracted triple, producing ~13
provenance triples per knowledge triple.  Since each chunk is processed
by a single LLM call, this was both redundant and semantically
inaccurate.

Now one subgraph object is created per chunk extraction, with
tg:contains linking to each extracted triple.  For 20 extractions from
a chunk this reduces provenance from ~260 triples to ~33.

- Rename tg:reifies -> tg:contains, stmt_uri -> subgraph_uri
- Replace triple_provenance_triples() with subgraph_provenance_triples()
- Refactor kg-extract-definitions and kg-extract-relationships to
  generate provenance once per chunk instead of per triple
- Add subgraph provenance to kg-extract-ontology and kg-extract-agent
  (previously had none)
- Update CLI tools and tech specs to match

Also rename tg-show-document-hierarchy to tg-show-extraction-provenance.

Also added extra type annotations for extraction provenance and fixed the extraction-provenance CLI.
This commit is contained in:
cybermaggedon 2026-03-13 11:37:59 +00:00 committed by GitHub
parent 35128ff019
commit 64e3f6bd0d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 463 additions and 193 deletions

View file

@ -11,6 +11,8 @@ from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, SUBJECT_OF, DEFINITION
from ....base import FlowProcessor, ConsumerSpec, ProducerSpec
from ....base import AgentClientSpec
from ....provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE
from ....flow_version import __version__ as COMPONENT_VERSION
from ....template import PromptManager
# Module logger
@ -196,9 +198,21 @@ class Processor(FlowProcessor):
return
# Process extraction data
triples, entity_contexts = self.process_extraction_data(
extraction_data, v.metadata
)
triples, entity_contexts, extracted_triples = \
self.process_extraction_data(extraction_data, v.metadata)
# Generate subgraph provenance for extracted triples
if extracted_triples:
chunk_uri = v.metadata.id
sg_uri = subgraph_uri()
prov_triples = subgraph_provenance_triples(
subgraph_uri=sg_uri,
extracted_triples=extracted_triples,
chunk_uri=chunk_uri,
component_name=default_ident,
component_version=COMPONENT_VERSION,
)
triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
# Emit outputs
if triples:
@ -221,8 +235,13 @@ class Processor(FlowProcessor):
Data is a flat list of objects with 'type' discriminator field:
- {"type": "definition", "entity": "...", "definition": "..."}
- {"type": "relationship", "subject": "...", "predicate": "...", "object": "...", "object-entity": bool}
Returns:
Tuple of (all_triples, entity_contexts, extracted_triples) where
extracted_triples contains only the core knowledge facts (for provenance).
"""
triples = []
extracted_triples = []
entity_contexts = []
# Categorize items by type
@ -242,11 +261,13 @@ class Processor(FlowProcessor):
))
# Add definition
triples.append(Triple(
definition_triple = Triple(
s = Term(type=IRI, iri=entity_uri),
p = Term(type=IRI, iri=DEFINITION),
o = Term(type=LITERAL, value=defn["definition"]),
))
)
triples.append(definition_triple)
extracted_triples.append(definition_triple)
# Add subject-of relationship to document
if metadata.id:
@ -261,7 +282,7 @@ class Processor(FlowProcessor):
entity=Term(type=IRI, iri=entity_uri),
context=defn["definition"]
))
# Process relationships
for rel in relationships:
@ -298,11 +319,13 @@ class Processor(FlowProcessor):
))
# Add the main relationship triple
triples.append(Triple(
relationship_triple = Triple(
s = subject_value,
p = predicate_value,
o = object_value
))
)
triples.append(relationship_triple)
extracted_triples.append(relationship_triple)
# Add subject-of relationships to document
if metadata.id:
@ -324,8 +347,8 @@ class Processor(FlowProcessor):
p = Term(type=IRI, iri=SUBJECT_OF),
o = Term(type=IRI, iri=metadata.id),
))
return triples, entity_contexts
return triples, entity_contexts, extracted_triples
@staticmethod
def add_args(parser):

View file

@ -20,7 +20,7 @@ from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base import PromptClientSpec, ParameterSpec
from .... provenance import statement_uri, triple_provenance_triples, set_graph, GRAPH_SOURCE
from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE
from .... flow_version import __version__ as COMPONENT_VERSION
DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION)
@ -133,6 +133,7 @@ class Processor(FlowProcessor):
raise e
triples = []
extracted_triples = []
entities = []
# Get chunk document ID for provenance linking
@ -173,20 +174,7 @@ class Processor(FlowProcessor):
s=s_value, p=DEFINITION_VALUE, o=o_value
)
triples.append(definition_triple)
# Generate provenance for the definition triple (reification)
# Provenance triples go in the source graph for separation from core knowledge
stmt_uri = statement_uri()
prov_triples = triple_provenance_triples(
stmt_uri=stmt_uri,
extracted_triple=definition_triple,
chunk_uri=chunk_uri,
component_name=default_ident,
component_version=COMPONENT_VERSION,
llm_model=llm_model,
ontology_uri=ontology_uri,
)
triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
extracted_triples.append(definition_triple)
# Link entity to chunk (not top-level document)
triples.append(Triple(
@ -211,6 +199,20 @@ class Processor(FlowProcessor):
chunk_id=chunk_doc_id,
))
# Generate subgraph provenance once for all extracted triples
if extracted_triples:
sg_uri = subgraph_uri()
prov_triples = subgraph_provenance_triples(
subgraph_uri=sg_uri,
extracted_triples=extracted_triples,
chunk_uri=chunk_uri,
component_name=default_ident,
component_version=COMPONENT_VERSION,
llm_model=llm_model,
ontology_uri=ontology_uri,
)
triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
# Send triples in batches
for i in range(0, len(triples), self.triples_batch_size):
batch = triples[i:i + self.triples_batch_size]

View file

@ -23,6 +23,9 @@ from .ontology_selector import OntologySelector, OntologySubset
from .simplified_parser import parse_extraction_response
from .triple_converter import TripleConverter
from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE
from .... flow_version import __version__ as COMPONENT_VERSION
logger = logging.getLogger(__name__)
default_ident = "kg-extract-ontology"
@ -306,11 +309,25 @@ class Processor(FlowProcessor):
flow, chunk, ontology_subset, prompt_variables
)
# Generate subgraph provenance for extracted triples
if triples:
chunk_uri = v.metadata.id
sg_uri = subgraph_uri()
prov_triples = subgraph_provenance_triples(
subgraph_uri=sg_uri,
extracted_triples=triples,
chunk_uri=chunk_uri,
component_name=default_ident,
component_version=COMPONENT_VERSION,
)
# Generate ontology definition triples
ontology_triples = self.build_ontology_triples(ontology_subset)
# Combine extracted triples with ontology triples
# Combine extracted triples with ontology triples and provenance
all_triples = triples + ontology_triples
if triples:
all_triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
# Build entity contexts from all triples (including ontology elements)
entity_contexts = self.build_entity_contexts(all_triples)

View file

@ -20,7 +20,7 @@ from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base import PromptClientSpec, ParameterSpec
from .... provenance import statement_uri, triple_provenance_triples, set_graph, GRAPH_SOURCE
from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE
from .... flow_version import __version__ as COMPONENT_VERSION
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
@ -115,6 +115,7 @@ class Processor(FlowProcessor):
raise e
triples = []
extracted_triples = []
# Get chunk document ID for provenance linking
chunk_doc_id = v.document_id if v.document_id else v.metadata.id
@ -160,20 +161,7 @@ class Processor(FlowProcessor):
o=o_value
)
triples.append(relationship_triple)
# Generate provenance for the relationship triple (reification)
# Provenance triples go in the source graph for separation from core knowledge
stmt_uri = statement_uri()
prov_triples = triple_provenance_triples(
stmt_uri=stmt_uri,
extracted_triple=relationship_triple,
chunk_uri=chunk_uri,
component_name=default_ident,
component_version=COMPONENT_VERSION,
llm_model=llm_model,
ontology_uri=ontology_uri,
)
triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
extracted_triples.append(relationship_triple)
# Label for s
triples.append(Triple(
@ -212,6 +200,20 @@ class Processor(FlowProcessor):
o=Term(type=IRI, iri=chunk_uri)
))
# Generate subgraph provenance once for all extracted triples
if extracted_triples:
sg_uri = subgraph_uri()
prov_triples = subgraph_provenance_triples(
subgraph_uri=sg_uri,
extracted_triples=extracted_triples,
chunk_uri=chunk_uri,
component_name=default_ident,
component_version=COMPONENT_VERSION,
llm_model=llm_model,
ontology_uri=ontology_uri,
)
triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
# Send triples in batches
for i in range(0, len(triples), self.triples_batch_size):
batch = triples[i:i + self.triples_batch_size]