mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Subgraph provenance (#694)
Replace per-triple provenance reification with subgraph model.

Extraction provenance previously created a full reification (statement URI, activity, agent) for every single extracted triple, producing ~13 provenance triples per knowledge triple. Since each chunk is processed by a single LLM call, this was both redundant and semantically inaccurate.

Now one subgraph object is created per chunk extraction, with tg:contains linking to each extracted triple. For 20 extractions from a chunk this reduces provenance from ~260 triples to ~33.

- Rename tg:reifies -> tg:contains, stmt_uri -> subgraph_uri
- Replace triple_provenance_triples() with subgraph_provenance_triples()
- Refactor kg-extract-definitions and kg-extract-relationships to generate provenance once per chunk instead of per triple
- Add subgraph provenance to kg-extract-ontology and kg-extract-agent (previously had none)
- Update CLI tools and tech specs to match

Also rename tg-show-document-hierarchy to tg-show-extraction-provenance, add extra typing for extraction provenance, and fix the extraction provenance CLI.
This commit is contained in:
parent
35128ff019
commit
64e3f6bd0d
20 changed files with 463 additions and 193 deletions
|
|
@ -41,7 +41,7 @@ class QuotedTriple:
|
|||
enabling statements about statements.
|
||||
|
||||
Example:
|
||||
# stmt:123 tg:reifies <<:Hope skos:definition "A feeling...">>
|
||||
# subgraph:123 tg:contains <<:Hope skos:definition "A feeling...">>
|
||||
qt = QuotedTriple(
|
||||
s=Uri("https://example.org/Hope"),
|
||||
p=Uri("http://www.w3.org/2004/02/skos/core#definition"),
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
Provenance module for extraction-time provenance support.
|
||||
|
||||
Provides helpers for:
|
||||
- URI generation for documents, pages, chunks, activities, statements
|
||||
- URI generation for documents, pages, chunks, activities, subgraphs
|
||||
- PROV-O triple building for provenance metadata
|
||||
- Vocabulary bootstrap for per-collection initialization
|
||||
|
||||
|
|
@ -38,7 +38,7 @@ from . uris import (
|
|||
chunk_uri_from_page,
|
||||
chunk_uri_from_doc,
|
||||
activity_uri,
|
||||
statement_uri,
|
||||
subgraph_uri,
|
||||
agent_uri,
|
||||
# Query-time provenance URIs (GraphRAG)
|
||||
question_uri,
|
||||
|
|
@ -66,11 +66,13 @@ from . namespaces import (
|
|||
# RDF/RDFS
|
||||
RDF, RDF_TYPE, RDFS, RDFS_LABEL,
|
||||
# TrustGraph
|
||||
TG, TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
|
||||
TG, TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
|
||||
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
|
||||
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
|
||||
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
|
||||
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
|
||||
# Extraction provenance entity types
|
||||
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
|
||||
# Query-time provenance predicates (GraphRAG)
|
||||
TG_QUERY, TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_REASONING, TG_CONTENT,
|
||||
# Query-time provenance predicates (DocumentRAG)
|
||||
|
|
@ -94,7 +96,7 @@ from . namespaces import (
|
|||
from . triples import (
|
||||
document_triples,
|
||||
derived_entity_triples,
|
||||
triple_provenance_triples,
|
||||
subgraph_provenance_triples,
|
||||
# Query-time provenance triple builders (GraphRAG)
|
||||
question_triples,
|
||||
exploration_triples,
|
||||
|
|
@ -121,6 +123,7 @@ from . vocabulary import (
|
|||
PROV_CLASS_LABELS,
|
||||
PROV_PREDICATE_LABELS,
|
||||
DC_PREDICATE_LABELS,
|
||||
TG_CLASS_LABELS,
|
||||
TG_PREDICATE_LABELS,
|
||||
)
|
||||
|
||||
|
|
@ -132,7 +135,7 @@ __all__ = [
|
|||
"chunk_uri_from_page",
|
||||
"chunk_uri_from_doc",
|
||||
"activity_uri",
|
||||
"statement_uri",
|
||||
"subgraph_uri",
|
||||
"agent_uri",
|
||||
# Query-time provenance URIs
|
||||
"question_uri",
|
||||
|
|
@ -153,11 +156,13 @@ __all__ = [
|
|||
"PROV_USED", "PROV_WAS_ASSOCIATED_WITH", "PROV_STARTED_AT_TIME",
|
||||
"DC", "DC_TITLE", "DC_SOURCE", "DC_DATE", "DC_CREATOR",
|
||||
"RDF", "RDF_TYPE", "RDFS", "RDFS_LABEL",
|
||||
"TG", "TG_REIFIES", "TG_PAGE_COUNT", "TG_MIME_TYPE", "TG_PAGE_NUMBER",
|
||||
"TG", "TG_CONTAINS", "TG_PAGE_COUNT", "TG_MIME_TYPE", "TG_PAGE_NUMBER",
|
||||
"TG_CHUNK_INDEX", "TG_CHAR_OFFSET", "TG_CHAR_LENGTH",
|
||||
"TG_CHUNK_SIZE", "TG_CHUNK_OVERLAP", "TG_COMPONENT_VERSION",
|
||||
"TG_LLM_MODEL", "TG_ONTOLOGY", "TG_EMBEDDING_MODEL",
|
||||
"TG_SOURCE_TEXT", "TG_SOURCE_CHAR_OFFSET", "TG_SOURCE_CHAR_LENGTH",
|
||||
# Extraction provenance entity types
|
||||
"TG_DOCUMENT_TYPE", "TG_PAGE_TYPE", "TG_CHUNK_TYPE", "TG_SUBGRAPH_TYPE",
|
||||
# Query-time provenance predicates (GraphRAG)
|
||||
"TG_QUERY", "TG_EDGE_COUNT", "TG_SELECTED_EDGE", "TG_REASONING", "TG_CONTENT",
|
||||
# Query-time provenance predicates (DocumentRAG)
|
||||
|
|
@ -178,7 +183,7 @@ __all__ = [
|
|||
# Triple builders
|
||||
"document_triples",
|
||||
"derived_entity_triples",
|
||||
"triple_provenance_triples",
|
||||
"subgraph_provenance_triples",
|
||||
# Query-time provenance triple builders (GraphRAG)
|
||||
"question_triples",
|
||||
"exploration_triples",
|
||||
|
|
@ -199,5 +204,6 @@ __all__ = [
|
|||
"PROV_CLASS_LABELS",
|
||||
"PROV_PREDICATE_LABELS",
|
||||
"DC_PREDICATE_LABELS",
|
||||
"TG_CLASS_LABELS",
|
||||
"TG_PREDICATE_LABELS",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ SKOS_DEFINITION = SKOS + "definition"
|
|||
|
||||
# TrustGraph namespace for custom predicates
|
||||
TG = "https://trustgraph.ai/ns/"
|
||||
TG_REIFIES = TG + "reifies"
|
||||
TG_CONTAINS = TG + "contains"
|
||||
TG_PAGE_COUNT = TG + "pageCount"
|
||||
TG_MIME_TYPE = TG + "mimeType"
|
||||
TG_PAGE_NUMBER = TG + "pageNumber"
|
||||
|
|
@ -72,6 +72,12 @@ TG_DOCUMENT = TG + "document" # Reference to document in librarian
|
|||
TG_CHUNK_COUNT = TG + "chunkCount"
|
||||
TG_SELECTED_CHUNK = TG + "selectedChunk"
|
||||
|
||||
# Extraction provenance entity types
|
||||
TG_DOCUMENT_TYPE = TG + "Document"
|
||||
TG_PAGE_TYPE = TG + "Page"
|
||||
TG_CHUNK_TYPE = TG + "Chunk"
|
||||
TG_SUBGRAPH_TYPE = TG + "Subgraph"
|
||||
|
||||
# Explainability entity types (shared)
|
||||
TG_QUESTION = TG + "Question"
|
||||
TG_EXPLORATION = TG + "Exploration"
|
||||
|
|
|
|||
|
|
@ -16,7 +16,9 @@ from . namespaces import (
|
|||
TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
|
||||
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
|
||||
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
|
||||
TG_LLM_MODEL, TG_ONTOLOGY, TG_REIFIES,
|
||||
TG_LLM_MODEL, TG_ONTOLOGY, TG_CONTAINS,
|
||||
# Extraction provenance entity types
|
||||
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
|
||||
# Query-time provenance predicates (GraphRAG)
|
||||
TG_QUERY, TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_EDGE, TG_REASONING, TG_CONTENT,
|
||||
TG_DOCUMENT,
|
||||
|
|
@ -28,7 +30,7 @@ from . namespaces import (
|
|||
TG_GRAPH_RAG_QUESTION, TG_DOC_RAG_QUESTION,
|
||||
)
|
||||
|
||||
from . uris import activity_uri, agent_uri, edge_selection_uri
|
||||
from . uris import activity_uri, agent_uri, subgraph_uri, edge_selection_uri
|
||||
|
||||
|
||||
def set_graph(triples: List[Triple], graph: str) -> List[Triple]:
|
||||
|
|
@ -92,6 +94,7 @@ def document_triples(
|
|||
"""
|
||||
triples = [
|
||||
_triple(doc_uri, RDF_TYPE, _iri(PROV_ENTITY)),
|
||||
_triple(doc_uri, RDF_TYPE, _iri(TG_DOCUMENT_TYPE)),
|
||||
]
|
||||
|
||||
if title:
|
||||
|
|
@ -162,10 +165,23 @@ def derived_entity_triples(
|
|||
act_uri = activity_uri()
|
||||
agt_uri = agent_uri(component_name)
|
||||
|
||||
# Determine specific type from parameters
|
||||
if page_number is not None:
|
||||
specific_type = TG_PAGE_TYPE
|
||||
elif chunk_index is not None:
|
||||
specific_type = TG_CHUNK_TYPE
|
||||
else:
|
||||
specific_type = None
|
||||
|
||||
triples = [
|
||||
# Entity declaration
|
||||
_triple(entity_uri, RDF_TYPE, _iri(PROV_ENTITY)),
|
||||
]
|
||||
|
||||
if specific_type:
|
||||
triples.append(_triple(entity_uri, RDF_TYPE, _iri(specific_type)))
|
||||
|
||||
triples.extend([
|
||||
# Derivation from parent
|
||||
_triple(entity_uri, PROV_WAS_DERIVED_FROM, _iri(parent_uri)),
|
||||
|
||||
|
|
@ -183,7 +199,7 @@ def derived_entity_triples(
|
|||
# Agent declaration
|
||||
_triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)),
|
||||
_triple(agt_uri, RDFS_LABEL, _literal(component_name)),
|
||||
]
|
||||
])
|
||||
|
||||
if label:
|
||||
triples.append(_triple(entity_uri, RDFS_LABEL, _literal(label)))
|
||||
|
|
@ -209,9 +225,9 @@ def derived_entity_triples(
|
|||
return triples
|
||||
|
||||
|
||||
def triple_provenance_triples(
|
||||
stmt_uri: str,
|
||||
extracted_triple: Triple,
|
||||
def subgraph_provenance_triples(
|
||||
subgraph_uri: str,
|
||||
extracted_triples: List[Triple],
|
||||
chunk_uri: str,
|
||||
component_name: str,
|
||||
component_version: str,
|
||||
|
|
@ -220,16 +236,20 @@ def triple_provenance_triples(
|
|||
timestamp: Optional[str] = None,
|
||||
) -> List[Triple]:
|
||||
"""
|
||||
Build provenance triples for an extracted knowledge triple using reification.
|
||||
Build provenance triples for a subgraph of extracted knowledge.
|
||||
|
||||
One subgraph per chunk extraction, shared across all triples produced
|
||||
from that chunk. This replaces per-triple reification with a
|
||||
containment model.
|
||||
|
||||
Creates:
|
||||
- Reification triple: stmt_uri tg:reifies <<extracted_triple>>
|
||||
- wasDerivedFrom link to source chunk
|
||||
- Activity and agent metadata
|
||||
- tg:contains link for each extracted triple (RDF-star quoted)
|
||||
- One prov:wasDerivedFrom link to source chunk
|
||||
- One activity with agent metadata
|
||||
|
||||
Args:
|
||||
stmt_uri: URI for the reified statement
|
||||
extracted_triple: The extracted Triple to reify
|
||||
subgraph_uri: URI for the extraction subgraph
|
||||
extracted_triples: The extracted Triple objects to include
|
||||
chunk_uri: URI of source chunk
|
||||
component_name: Name of extractor component
|
||||
component_version: Version of the component
|
||||
|
|
@ -238,7 +258,7 @@ def triple_provenance_triples(
|
|||
timestamp: ISO timestamp
|
||||
|
||||
Returns:
|
||||
List of Triple objects for the provenance (including reification)
|
||||
List of Triple objects for the provenance
|
||||
"""
|
||||
if timestamp is None:
|
||||
timestamp = datetime.utcnow().isoformat() + "Z"
|
||||
|
|
@ -246,20 +266,23 @@ def triple_provenance_triples(
|
|||
act_uri = activity_uri()
|
||||
agt_uri = agent_uri(component_name)
|
||||
|
||||
# Create the quoted triple term (RDF-star reification)
|
||||
triple_term = Term(type=TRIPLE, triple=extracted_triple)
|
||||
triples = []
|
||||
|
||||
triples = [
|
||||
# Reification: stmt_uri tg:reifies <<s p o>>
|
||||
Triple(
|
||||
s=_iri(stmt_uri),
|
||||
p=_iri(TG_REIFIES),
|
||||
# Containment: subgraph tg:contains <<s p o>> for each extracted triple
|
||||
for extracted_triple in extracted_triples:
|
||||
triple_term = Term(type=TRIPLE, triple=extracted_triple)
|
||||
triples.append(Triple(
|
||||
s=_iri(subgraph_uri),
|
||||
p=_iri(TG_CONTAINS),
|
||||
o=triple_term
|
||||
),
|
||||
))
|
||||
|
||||
# Statement provenance
|
||||
_triple(stmt_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)),
|
||||
_triple(stmt_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),
|
||||
# Subgraph provenance
|
||||
triples.extend([
|
||||
_triple(subgraph_uri, RDF_TYPE, _iri(PROV_ENTITY)),
|
||||
_triple(subgraph_uri, RDF_TYPE, _iri(TG_SUBGRAPH_TYPE)),
|
||||
_triple(subgraph_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)),
|
||||
_triple(subgraph_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),
|
||||
|
||||
# Activity
|
||||
_triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
|
||||
|
|
@ -272,7 +295,7 @@ def triple_provenance_triples(
|
|||
# Agent
|
||||
_triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)),
|
||||
_triple(agt_uri, RDFS_LABEL, _literal(component_name)),
|
||||
]
|
||||
])
|
||||
|
||||
if llm_model:
|
||||
triples.append(_triple(act_uri, TG_LLM_MODEL, _literal(llm_model)))
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ Child entities (pages, chunks) append path segments to the parent IRI:
|
|||
- Chunk: {page_iri}/c{chunk_index} (from page)
|
||||
{doc_iri}/c{chunk_index} (from text doc)
|
||||
- Activity: https://trustgraph.ai/activity/{uuid}
|
||||
- Statement: https://trustgraph.ai/stmt/{uuid}
|
||||
- Subgraph: https://trustgraph.ai/subgraph/{uuid}
|
||||
"""
|
||||
|
||||
import uuid
|
||||
|
|
@ -50,11 +50,11 @@ def activity_uri(activity_id: str = None) -> str:
|
|||
return f"{TRUSTGRAPH_BASE}/activity/{_encode_id(activity_id)}"
|
||||
|
||||
|
||||
def statement_uri(stmt_id: str = None) -> str:
|
||||
"""Generate URI for a reified statement. Auto-generates UUID if not provided."""
|
||||
if stmt_id is None:
|
||||
stmt_id = str(uuid.uuid4())
|
||||
return f"{TRUSTGRAPH_BASE}/stmt/{_encode_id(stmt_id)}"
|
||||
def subgraph_uri(subgraph_id: str = None) -> str:
|
||||
"""Generate URI for an extraction subgraph. Auto-generates UUID if not provided."""
|
||||
if subgraph_id is None:
|
||||
subgraph_id = str(uuid.uuid4())
|
||||
return f"{TRUSTGRAPH_BASE}/subgraph/{_encode_id(subgraph_id)}"
|
||||
|
||||
|
||||
def agent_uri(component_name: str) -> str:
|
||||
|
|
|
|||
|
|
@ -19,11 +19,12 @@ from . namespaces import (
|
|||
SCHEMA_SUBJECT_OF, SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
|
||||
SCHEMA_KEYWORDS, SCHEMA_NAME,
|
||||
SKOS_DEFINITION,
|
||||
TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
|
||||
TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
|
||||
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
|
||||
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
|
||||
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
|
||||
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
|
||||
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -74,9 +75,17 @@ SKOS_LABELS = [
|
|||
_label_triple(SKOS_DEFINITION, "definition"),
|
||||
]
|
||||
|
||||
# TrustGraph class labels (extraction provenance)
|
||||
TG_CLASS_LABELS = [
|
||||
_label_triple(TG_DOCUMENT_TYPE, "Document"),
|
||||
_label_triple(TG_PAGE_TYPE, "Page"),
|
||||
_label_triple(TG_CHUNK_TYPE, "Chunk"),
|
||||
_label_triple(TG_SUBGRAPH_TYPE, "Subgraph"),
|
||||
]
|
||||
|
||||
# TrustGraph predicate labels
|
||||
TG_PREDICATE_LABELS = [
|
||||
_label_triple(TG_REIFIES, "reifies"),
|
||||
_label_triple(TG_CONTAINS, "contains"),
|
||||
_label_triple(TG_PAGE_COUNT, "page count"),
|
||||
_label_triple(TG_MIME_TYPE, "MIME type"),
|
||||
_label_triple(TG_PAGE_NUMBER, "page number"),
|
||||
|
|
@ -116,5 +125,6 @@ def get_vocabulary_triples() -> List[Triple]:
|
|||
DC_PREDICATE_LABELS +
|
||||
SCHEMA_LABELS +
|
||||
SKOS_LABELS +
|
||||
TG_CLASS_LABELS +
|
||||
TG_PREDICATE_LABELS
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue