mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Subgraph provenance (#694)
Replace per-triple provenance reification with subgraph model. Extraction provenance previously created a full reification (statement URI, activity, agent) for every single extracted triple, producing ~13 provenance triples per knowledge triple. Since each chunk is processed by a single LLM call, this was both redundant and semantically inaccurate. Now one subgraph object is created per chunk extraction, with tg:contains linking to each extracted triple. For 20 extractions from a chunk this reduces provenance from ~260 triples to ~33. - Rename tg:reifies -> tg:contains, stmt_uri -> subgraph_uri - Replace triple_provenance_triples() with subgraph_provenance_triples() - Refactor kg-extract-definitions and kg-extract-relationships to generate provenance once per chunk instead of per triple - Add subgraph provenance to kg-extract-ontology and kg-extract-agent (previously had none) - Update CLI tools and tech specs to match. Also rename tg-show-document-hierarchy to tg-show-extraction-provenance. Added extra typing for extraction provenance and fixed the extraction provenance CLI.
This commit is contained in:
parent
35128ff019
commit
64e3f6bd0d
20 changed files with 463 additions and 193 deletions
|
|
@ -11,6 +11,8 @@ from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, SUBJECT_OF, DEFINITION
|
|||
from ....base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||
from ....base import AgentClientSpec
|
||||
|
||||
from ....provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE
|
||||
from ....flow_version import __version__ as COMPONENT_VERSION
|
||||
from ....template import PromptManager
|
||||
|
||||
# Module logger
|
||||
|
|
@ -196,9 +198,21 @@ class Processor(FlowProcessor):
|
|||
return
|
||||
|
||||
# Process extraction data
|
||||
triples, entity_contexts = self.process_extraction_data(
|
||||
extraction_data, v.metadata
|
||||
)
|
||||
triples, entity_contexts, extracted_triples = \
|
||||
self.process_extraction_data(extraction_data, v.metadata)
|
||||
|
||||
# Generate subgraph provenance for extracted triples
|
||||
if extracted_triples:
|
||||
chunk_uri = v.metadata.id
|
||||
sg_uri = subgraph_uri()
|
||||
prov_triples = subgraph_provenance_triples(
|
||||
subgraph_uri=sg_uri,
|
||||
extracted_triples=extracted_triples,
|
||||
chunk_uri=chunk_uri,
|
||||
component_name=default_ident,
|
||||
component_version=COMPONENT_VERSION,
|
||||
)
|
||||
triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
|
||||
|
||||
# Emit outputs
|
||||
if triples:
|
||||
|
|
@ -221,8 +235,13 @@ class Processor(FlowProcessor):
|
|||
Data is a flat list of objects with 'type' discriminator field:
|
||||
- {"type": "definition", "entity": "...", "definition": "..."}
|
||||
- {"type": "relationship", "subject": "...", "predicate": "...", "object": "...", "object-entity": bool}
|
||||
|
||||
Returns:
|
||||
Tuple of (all_triples, entity_contexts, extracted_triples) where
|
||||
extracted_triples contains only the core knowledge facts (for provenance).
|
||||
"""
|
||||
triples = []
|
||||
extracted_triples = []
|
||||
entity_contexts = []
|
||||
|
||||
# Categorize items by type
|
||||
|
|
@ -242,11 +261,13 @@ class Processor(FlowProcessor):
|
|||
))
|
||||
|
||||
# Add definition
|
||||
triples.append(Triple(
|
||||
definition_triple = Triple(
|
||||
s = Term(type=IRI, iri=entity_uri),
|
||||
p = Term(type=IRI, iri=DEFINITION),
|
||||
o = Term(type=LITERAL, value=defn["definition"]),
|
||||
))
|
||||
)
|
||||
triples.append(definition_triple)
|
||||
extracted_triples.append(definition_triple)
|
||||
|
||||
# Add subject-of relationship to document
|
||||
if metadata.id:
|
||||
|
|
@ -261,7 +282,7 @@ class Processor(FlowProcessor):
|
|||
entity=Term(type=IRI, iri=entity_uri),
|
||||
context=defn["definition"]
|
||||
))
|
||||
|
||||
|
||||
# Process relationships
|
||||
for rel in relationships:
|
||||
|
||||
|
|
@ -298,11 +319,13 @@ class Processor(FlowProcessor):
|
|||
))
|
||||
|
||||
# Add the main relationship triple
|
||||
triples.append(Triple(
|
||||
relationship_triple = Triple(
|
||||
s = subject_value,
|
||||
p = predicate_value,
|
||||
o = object_value
|
||||
))
|
||||
)
|
||||
triples.append(relationship_triple)
|
||||
extracted_triples.append(relationship_triple)
|
||||
|
||||
# Add subject-of relationships to document
|
||||
if metadata.id:
|
||||
|
|
@ -324,8 +347,8 @@ class Processor(FlowProcessor):
|
|||
p = Term(type=IRI, iri=SUBJECT_OF),
|
||||
o = Term(type=IRI, iri=metadata.id),
|
||||
))
|
||||
|
||||
return triples, entity_contexts
|
||||
|
||||
return triples, entity_contexts, extracted_triples
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
|||
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||
from .... base import PromptClientSpec, ParameterSpec
|
||||
|
||||
from .... provenance import statement_uri, triple_provenance_triples, set_graph, GRAPH_SOURCE
|
||||
from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE
|
||||
from .... flow_version import __version__ as COMPONENT_VERSION
|
||||
|
||||
DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION)
|
||||
|
|
@ -133,6 +133,7 @@ class Processor(FlowProcessor):
|
|||
raise e
|
||||
|
||||
triples = []
|
||||
extracted_triples = []
|
||||
entities = []
|
||||
|
||||
# Get chunk document ID for provenance linking
|
||||
|
|
@ -173,20 +174,7 @@ class Processor(FlowProcessor):
|
|||
s=s_value, p=DEFINITION_VALUE, o=o_value
|
||||
)
|
||||
triples.append(definition_triple)
|
||||
|
||||
# Generate provenance for the definition triple (reification)
|
||||
# Provenance triples go in the source graph for separation from core knowledge
|
||||
stmt_uri = statement_uri()
|
||||
prov_triples = triple_provenance_triples(
|
||||
stmt_uri=stmt_uri,
|
||||
extracted_triple=definition_triple,
|
||||
chunk_uri=chunk_uri,
|
||||
component_name=default_ident,
|
||||
component_version=COMPONENT_VERSION,
|
||||
llm_model=llm_model,
|
||||
ontology_uri=ontology_uri,
|
||||
)
|
||||
triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
|
||||
extracted_triples.append(definition_triple)
|
||||
|
||||
# Link entity to chunk (not top-level document)
|
||||
triples.append(Triple(
|
||||
|
|
@ -211,6 +199,20 @@ class Processor(FlowProcessor):
|
|||
chunk_id=chunk_doc_id,
|
||||
))
|
||||
|
||||
# Generate subgraph provenance once for all extracted triples
|
||||
if extracted_triples:
|
||||
sg_uri = subgraph_uri()
|
||||
prov_triples = subgraph_provenance_triples(
|
||||
subgraph_uri=sg_uri,
|
||||
extracted_triples=extracted_triples,
|
||||
chunk_uri=chunk_uri,
|
||||
component_name=default_ident,
|
||||
component_version=COMPONENT_VERSION,
|
||||
llm_model=llm_model,
|
||||
ontology_uri=ontology_uri,
|
||||
)
|
||||
triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
|
||||
|
||||
# Send triples in batches
|
||||
for i in range(0, len(triples), self.triples_batch_size):
|
||||
batch = triples[i:i + self.triples_batch_size]
|
||||
|
|
|
|||
|
|
@ -23,6 +23,9 @@ from .ontology_selector import OntologySelector, OntologySubset
|
|||
from .simplified_parser import parse_extraction_response
|
||||
from .triple_converter import TripleConverter
|
||||
|
||||
from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE
|
||||
from .... flow_version import __version__ as COMPONENT_VERSION
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_ident = "kg-extract-ontology"
|
||||
|
|
@ -306,11 +309,25 @@ class Processor(FlowProcessor):
|
|||
flow, chunk, ontology_subset, prompt_variables
|
||||
)
|
||||
|
||||
# Generate subgraph provenance for extracted triples
|
||||
if triples:
|
||||
chunk_uri = v.metadata.id
|
||||
sg_uri = subgraph_uri()
|
||||
prov_triples = subgraph_provenance_triples(
|
||||
subgraph_uri=sg_uri,
|
||||
extracted_triples=triples,
|
||||
chunk_uri=chunk_uri,
|
||||
component_name=default_ident,
|
||||
component_version=COMPONENT_VERSION,
|
||||
)
|
||||
|
||||
# Generate ontology definition triples
|
||||
ontology_triples = self.build_ontology_triples(ontology_subset)
|
||||
|
||||
# Combine extracted triples with ontology triples
|
||||
# Combine extracted triples with ontology triples and provenance
|
||||
all_triples = triples + ontology_triples
|
||||
if triples:
|
||||
all_triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
|
||||
|
||||
# Build entity contexts from all triples (including ontology elements)
|
||||
entity_contexts = self.build_entity_contexts(all_triples)
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
|
|||
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||
from .... base import PromptClientSpec, ParameterSpec
|
||||
|
||||
from .... provenance import statement_uri, triple_provenance_triples, set_graph, GRAPH_SOURCE
|
||||
from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE
|
||||
from .... flow_version import __version__ as COMPONENT_VERSION
|
||||
|
||||
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
|
||||
|
|
@ -115,6 +115,7 @@ class Processor(FlowProcessor):
|
|||
raise e
|
||||
|
||||
triples = []
|
||||
extracted_triples = []
|
||||
|
||||
# Get chunk document ID for provenance linking
|
||||
chunk_doc_id = v.document_id if v.document_id else v.metadata.id
|
||||
|
|
@ -160,20 +161,7 @@ class Processor(FlowProcessor):
|
|||
o=o_value
|
||||
)
|
||||
triples.append(relationship_triple)
|
||||
|
||||
# Generate provenance for the relationship triple (reification)
|
||||
# Provenance triples go in the source graph for separation from core knowledge
|
||||
stmt_uri = statement_uri()
|
||||
prov_triples = triple_provenance_triples(
|
||||
stmt_uri=stmt_uri,
|
||||
extracted_triple=relationship_triple,
|
||||
chunk_uri=chunk_uri,
|
||||
component_name=default_ident,
|
||||
component_version=COMPONENT_VERSION,
|
||||
llm_model=llm_model,
|
||||
ontology_uri=ontology_uri,
|
||||
)
|
||||
triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
|
||||
extracted_triples.append(relationship_triple)
|
||||
|
||||
# Label for s
|
||||
triples.append(Triple(
|
||||
|
|
@ -212,6 +200,20 @@ class Processor(FlowProcessor):
|
|||
o=Term(type=IRI, iri=chunk_uri)
|
||||
))
|
||||
|
||||
# Generate subgraph provenance once for all extracted triples
|
||||
if extracted_triples:
|
||||
sg_uri = subgraph_uri()
|
||||
prov_triples = subgraph_provenance_triples(
|
||||
subgraph_uri=sg_uri,
|
||||
extracted_triples=extracted_triples,
|
||||
chunk_uri=chunk_uri,
|
||||
component_name=default_ident,
|
||||
component_version=COMPONENT_VERSION,
|
||||
llm_model=llm_model,
|
||||
ontology_uri=ontology_uri,
|
||||
)
|
||||
triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
|
||||
|
||||
# Send triples in batches
|
||||
for i in range(0, len(triples), self.triples_batch_size):
|
||||
batch = triples[i:i + self.triples_batch_size]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue