From 64e3f6bd0d30c64fcde1497de50dffaa6687e12d Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 13 Mar 2026 11:37:59 +0000 Subject: [PATCH] Subgraph provenance (#694) Replace per-triple provenance reification with subgraph model Extraction provenance previously created a full reification (statement URI, activity, agent) for every single extracted triple, producing ~13 provenance triples per knowledge triple. Since each chunk is processed by a single LLM call, this was both redundant and semantically inaccurate. Now one subgraph object is created per chunk extraction, with tg:contains linking to each extracted triple. For 20 extractions from a chunk this reduces provenance from ~260 triples to ~33. - Rename tg:reifies -> tg:contains, stmt_uri -> subgraph_uri - Replace triple_provenance_triples() with subgraph_provenance_triples() - Refactor kg-extract-definitions and kg-extract-relationships to generate provenance once per chunk instead of per triple - Add subgraph provenance to kg-extract-ontology and kg-extract-agent (previously had none) - Update CLI tools and tech specs to match Also rename tg-show-document-hierarchy to tg-show-extraction-provenance. 
Added extra typing for extraction provenance, fixed extraction prov CLI --- .../extraction-provenance-subgraph.md | 205 ++++++++++++++++++ docs/tech-specs/extraction-time-provenance.md | 32 +-- docs/tech-specs/query-time-explainability.md | 4 +- .../test_agent_kg_extraction_integration.py | 2 +- .../test_agent_extraction.py | 14 +- .../test_agent_extraction_edge_cases.py | 20 +- trustgraph-base/trustgraph/knowledge/defs.py | 2 +- .../trustgraph/provenance/__init__.py | 20 +- .../trustgraph/provenance/namespaces.py | 8 +- .../trustgraph/provenance/triples.py | 73 ++++--- trustgraph-base/trustgraph/provenance/uris.py | 12 +- .../trustgraph/provenance/vocabulary.py | 14 +- trustgraph-cli/pyproject.toml | 2 +- .../trustgraph/cli/invoke_graph_rag.py | 8 +- .../trustgraph/cli/show_explain_trace.py | 6 +- ...archy.py => show_extraction_provenance.py} | 108 ++++----- .../trustgraph/extract/kg/agent/extract.py | 43 +++- .../extract/kg/definitions/extract.py | 32 +-- .../trustgraph/extract/kg/ontology/extract.py | 19 +- .../extract/kg/relationships/extract.py | 32 +-- 20 files changed, 463 insertions(+), 193 deletions(-) create mode 100644 docs/tech-specs/extraction-provenance-subgraph.md rename trustgraph-cli/trustgraph/cli/{show_document_hierarchy.py => show_extraction_provenance.py} (81%) diff --git a/docs/tech-specs/extraction-provenance-subgraph.md b/docs/tech-specs/extraction-provenance-subgraph.md new file mode 100644 index 00000000..ba0d3e50 --- /dev/null +++ b/docs/tech-specs/extraction-provenance-subgraph.md @@ -0,0 +1,205 @@ +# Extraction Provenance: Subgraph Model + +## Problem + +Extraction-time provenance currently generates a full reification per +extracted triple: a unique `stmt_uri`, `activity_uri`, and associated +PROV-O metadata for every single knowledge fact. Processing one chunk +that yields 20 relationships produces ~260 provenance triples on top of +the ~20 knowledge triples — a roughly 13:1 overhead. 
+ +This is both expensive (storage, indexing, transmission) and semantically +inaccurate. Each chunk is processed by a single LLM call that produces +all its triples in one transaction. The current per-triple model +obscures that by creating the illusion of 20 independent extraction +events. + +Additionally, two of the four extraction processors (kg-extract-ontology, +kg-extract-agent) have no provenance at all, leaving gaps in the audit +trail. + +## Solution + +Replace per-triple reification with a **subgraph model**: one provenance +record per chunk extraction, shared across all triples produced from that +chunk. + +### Terminology Change + +| Old | New | +|-----|-----| +| `stmt_uri` (`https://trustgraph.ai/stmt/{uuid}`) | `subgraph_uri` (`https://trustgraph.ai/subgraph/{uuid}`) | +| `statement_uri()` | `subgraph_uri()` | +| `tg:reifies` (1:1, identity) | `tg:contains` (1:many, containment) | + +### Target Structure + +All provenance triples go in the `urn:graph:source` named graph. + +``` +# Subgraph contains each extracted triple (RDF-star quoted triples) + tg:contains <> . + tg:contains <> . + tg:contains <> . + +# Derivation from source chunk + prov:wasDerivedFrom . + prov:wasGeneratedBy . + +# Activity: one per chunk extraction + rdf:type prov:Activity . + rdfs:label "{component_name} extraction" . + prov:used . + prov:wasAssociatedWith . + prov:startedAtTime "2026-03-13T10:00:00Z" . + tg:componentVersion "0.25.0" . + tg:llmModel "gpt-4" . # if available + tg:ontology . # if available + +# Agent: stable per component + rdf:type prov:Agent . + rdfs:label "{component_name}" . 
+``` + +### Volume Comparison + +For a chunk producing N extracted triples: + +| | Old (per-triple) | New (subgraph) | +|---|---|---| +| `tg:contains` / `tg:reifies` | N | N | +| Activity triples | ~9 x N | ~9 | +| Agent triples | 2 x N | 2 | +| Statement/subgraph metadata | 2 x N | 2 | +| **Total provenance triples** | **~13N** | **N + 13** | +| **Example (N=20)** | **~260** | **33** | + +## Scope + +### Processors to Update (existing provenance, per-triple) + +**kg-extract-definitions** +(`trustgraph-flow/trustgraph/extract/kg/definitions/extract.py`) + +Currently calls `statement_uri()` + `triple_provenance_triples()` inside +the per-definition loop. + +Changes: +- Move `subgraph_uri()` and `activity_uri()` creation before the loop +- Collect `tg:contains` triples inside the loop +- Emit shared activity/agent/derivation block once after the loop + +**kg-extract-relationships** +(`trustgraph-flow/trustgraph/extract/kg/relationships/extract.py`) + +Same pattern as definitions. Same changes. + +### Processors to Add Provenance (currently missing) + +**kg-extract-ontology** +(`trustgraph-flow/trustgraph/extract/kg/ontology/extract.py`) + +Currently emits triples with no provenance. Add subgraph provenance +using the same pattern: one subgraph per chunk, `tg:contains` for each +extracted triple. + +**kg-extract-agent** +(`trustgraph-flow/trustgraph/extract/kg/agent/extract.py`) + +Currently emits triples with no provenance. Add subgraph provenance +using the same pattern. 
+ +### Shared Provenance Library Changes + +**`trustgraph-base/trustgraph/provenance/triples.py`** + +- Replace `triple_provenance_triples()` with `subgraph_provenance_triples()` +- New function accepts a list of extracted triples instead of a single one +- Generates one `tg:contains` per triple, shared activity/agent block +- Remove old `triple_provenance_triples()` + +**`trustgraph-base/trustgraph/provenance/uris.py`** + +- Replace `statement_uri()` with `subgraph_uri()` + +**`trustgraph-base/trustgraph/provenance/namespaces.py`** + +- Replace `TG_REIFIES` with `TG_CONTAINS` + +### Not in Scope + +- **kg-extract-topics**: older-style processor, not currently used in + standard flows +- **kg-extract-rows**: produces rows not triples, different provenance + model +- **Query-time provenance** (`urn:graph:retrieval`): separate concern, + already uses a different pattern (question/exploration/focus/synthesis) +- **Document/page/chunk provenance** (PDF decoder, chunker): already uses + `derived_entity_triples()` which is per-entity, not per-triple — no + redundancy issue + +## Implementation Notes + +### Processor Loop Restructure + +Before (per-triple, in relationships): +```python +for rel in rels: + # ... build relationship_triple ... + stmt_uri = statement_uri() + prov_triples = triple_provenance_triples( + stmt_uri=stmt_uri, + extracted_triple=relationship_triple, + ... + ) + triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) +``` + +After (subgraph): +```python +sg_uri = subgraph_uri() + +for rel in rels: + # ... build relationship_triple ... 
+ extracted_triples.append(relationship_triple) + +prov_triples = subgraph_provenance_triples( + subgraph_uri=sg_uri, + extracted_triples=extracted_triples, + chunk_uri=chunk_uri, + component_name=default_ident, + component_version=COMPONENT_VERSION, + llm_model=llm_model, + ontology_uri=ontology_uri, +) +triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) +``` + +### New Helper Signature + +```python +def subgraph_provenance_triples( + subgraph_uri: str, + extracted_triples: List[Triple], + chunk_uri: str, + component_name: str, + component_version: str, + llm_model: Optional[str] = None, + ontology_uri: Optional[str] = None, + timestamp: Optional[str] = None, +) -> List[Triple]: + """ + Build provenance triples for a subgraph of extracted knowledge. + + Creates: + - tg:contains link for each extracted triple (RDF-star quoted) + - One prov:wasDerivedFrom link to source chunk + - One activity with agent metadata + """ +``` + +### Breaking Change + +This is a breaking change to the provenance model. Provenance has not +been released, so no migration is needed. The old `tg:reifies` / +`statement_uri` code can be removed outright. diff --git a/docs/tech-specs/extraction-time-provenance.md b/docs/tech-specs/extraction-time-provenance.md index 72c1c971..d70c8c11 100644 --- a/docs/tech-specs/extraction-time-provenance.md +++ b/docs/tech-specs/extraction-time-provenance.md @@ -311,10 +311,10 @@ activity:chunk-789 tg:chunkOverlap 200 . # The extracted triple (edge) entity:JohnSmith rel:worksAt entity:AcmeCorp . -# Statement object pointing at the edge (RDF 1.2 reification) -stmt:001 tg:reifies <> . -stmt:001 prov:wasDerivedFrom chunk:123-1-1 . -stmt:001 prov:wasGeneratedBy activity:extract-999 . +# Subgraph containing the extracted triples +subgraph:001 tg:contains <> . +subgraph:001 prov:wasDerivedFrom chunk:123-1-1 . +subgraph:001 prov:wasGeneratedBy activity:extract-999 . activity:extract-999 a prov:Activity . activity:extract-999 prov:used chunk:123-1-1 . 
@@ -344,7 +344,7 @@ Custom predicates under the `tg:` namespace for extraction-specific metadata: | Predicate | Domain | Description | |-----------|--------|-------------| -| `tg:reifies` | Statement | Points at the triple this statement object represents | +| `tg:contains` | Subgraph | Points at a triple contained in this extraction subgraph | | `tg:pageCount` | Document | Total number of pages in source document | | `tg:mimeType` | Document | MIME type of source document | | `tg:pageNumber` | Page | Page number in source document | @@ -383,7 +383,7 @@ prov:startedAtTime rdfs:label "started at" . **TrustGraph Predicates:** ``` -tg:reifies rdfs:label "reifies" . +tg:contains rdfs:label "contains" . tg:pageCount rdfs:label "page count" . tg:mimeType rdfs:label "MIME type" . tg:pageNumber rdfs:label "page number" . @@ -416,20 +416,20 @@ For finer-grained provenance, it would be valuable to record exactly where withi # The extracted triple entity:JohnSmith rel:worksAt entity:AcmeCorp . -# Statement with sub-chunk provenance -stmt:001 tg:reifies <> . -stmt:001 prov:wasDerivedFrom chunk:123-1-1 . -stmt:001 tg:sourceText "John Smith has worked at Acme Corp since 2019" . -stmt:001 tg:sourceCharOffset 1547 . -stmt:001 tg:sourceCharLength 46 . +# Subgraph with sub-chunk provenance +subgraph:001 tg:contains <> . +subgraph:001 prov:wasDerivedFrom chunk:123-1-1 . +subgraph:001 tg:sourceText "John Smith has worked at Acme Corp since 2019" . +subgraph:001 tg:sourceCharOffset 1547 . +subgraph:001 tg:sourceCharLength 46 . ``` **Example with text range (alternative):** ``` -stmt:001 tg:reifies <> . -stmt:001 prov:wasDerivedFrom chunk:123-1-1 . -stmt:001 tg:sourceRange "1547-1593" . -stmt:001 tg:sourceText "John Smith has worked at Acme Corp since 2019" . +subgraph:001 tg:contains <> . +subgraph:001 prov:wasDerivedFrom chunk:123-1-1 . +subgraph:001 tg:sourceRange "1547-1593" . +subgraph:001 tg:sourceText "John Smith has worked at Acme Corp since 2019" . 
``` **Implementation considerations:** diff --git a/docs/tech-specs/query-time-explainability.md b/docs/tech-specs/query-time-explainability.md index 3385efc9..e696745c 100644 --- a/docs/tech-specs/query-time-explainability.md +++ b/docs/tech-specs/query-time-explainability.md @@ -193,7 +193,7 @@ When storing explainability data, URIs from `uri_map` are used. Selected edges can be traced back to source documents: -1. Query for reifying statement: `?stmt tg:reifies <>` +1. Query for containing subgraph: `?subgraph tg:contains <>` 2. Follow `prov:wasDerivedFrom` chain to root document 3. Each step in chain: chunk → page → document @@ -209,7 +209,7 @@ elif term.type == TRIPLE: This enables queries like: ``` -?stmt tg:reifies <> +?subgraph tg:contains <> ``` ## CLI Usage diff --git a/tests/integration/test_agent_kg_extraction_integration.py b/tests/integration/test_agent_kg_extraction_integration.py index c97bd529..a2576274 100644 --- a/tests/integration/test_agent_kg_extraction_integration.py +++ b/tests/integration/test_agent_kg_extraction_integration.py @@ -128,7 +128,7 @@ class TestAgentKgExtractionIntegration: # Parse and process extraction_data = extractor.parse_jsonl(agent_response) - triples, entity_contexts = extractor.process_extraction_data(extraction_data, v.metadata) + triples, entity_contexts, extracted_triples = extractor.process_extraction_data(extraction_data, v.metadata) # Emit outputs if triples: diff --git a/tests/unit/test_knowledge_graph/test_agent_extraction.py b/tests/unit/test_knowledge_graph/test_agent_extraction.py index e4c723db..d2824c0c 100644 --- a/tests/unit/test_knowledge_graph/test_agent_extraction.py +++ b/tests/unit/test_knowledge_graph/test_agent_extraction.py @@ -168,7 +168,7 @@ This is not JSON at all } ] - triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata) # Check entity label triple label_triple = 
next((t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "Machine Learning"), None) @@ -206,7 +206,7 @@ This is not JSON at all } ] - triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata) # Check that subject, predicate, and object labels are created subject_uri = f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" @@ -244,7 +244,7 @@ This is not JSON at all } ] - triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata) # Check that object labels are not created for literal objects object_labels = [t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "95%"] @@ -253,7 +253,7 @@ This is not JSON at all def test_process_extraction_data_combined(self, agent_extractor, sample_metadata, sample_extraction_data): """Test processing of combined definitions and relationships""" - triples, entity_contexts = agent_extractor.process_extraction_data(sample_extraction_data, sample_metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(sample_extraction_data, sample_metadata) # Check that we have both definition and relationship triples definition_triples = [t for t in triples if t.p.iri == DEFINITION] @@ -272,7 +272,7 @@ This is not JSON at all {"type": "definition", "entity": "Test Entity", "definition": "Test definition"} ] - triples, entity_contexts = agent_extractor.process_extraction_data(data, metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should not create subject-of relationships when no metadata ID subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF] @@ -285,7 +285,7 @@ This is not JSON at all """Test processing of empty extraction data""" data = [] - triples, entity_contexts = 
agent_extractor.process_extraction_data(data, sample_metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata) # Should have no entity contexts assert len(entity_contexts) == 0 @@ -300,7 +300,7 @@ This is not JSON at all {"type": "relationship", "subject": "A", "predicate": "rel", "object": "B", "object-entity": True} ] - triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata) + triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata) # Should process valid items and ignore unknown types assert len(entity_contexts) == 1 # Only the definition creates entity context diff --git a/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py b/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py index 2f0174e5..ac20fe11 100644 --- a/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py +++ b/tests/unit/test_knowledge_graph/test_agent_extraction_edge_cases.py @@ -168,7 +168,7 @@ class TestAgentKgExtractionEdgeCases: """Test processing with empty or minimal metadata""" # Test with None metadata - may not raise AttributeError depending on implementation try: - triples, contexts = agent_extractor.process_extraction_data([], None) + triples, contexts, _ = agent_extractor.process_extraction_data([], None) # If it doesn't raise, check the results assert len(triples) == 0 assert len(contexts) == 0 @@ -178,14 +178,14 @@ class TestAgentKgExtractionEdgeCases: # Test with metadata without ID metadata = Metadata(id=None) - triples, contexts = agent_extractor.process_extraction_data([], metadata) + triples, contexts, _ = agent_extractor.process_extraction_data([], metadata) assert len(triples) == 0 assert len(contexts) == 0 # Test with metadata with empty string ID metadata = Metadata(id="") data = [{"type": "definition", "entity": "Test", "definition": "Test def"}] - triples, contexts = agent_extractor.process_extraction_data(data, 
metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should not create subject-of triples when ID is empty string subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF] @@ -213,7 +213,7 @@ class TestAgentKgExtractionEdgeCases: for entity in special_entities ] - triples, contexts = agent_extractor.process_extraction_data(data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Verify all entities were processed assert len(contexts) == len(special_entities) @@ -234,7 +234,7 @@ class TestAgentKgExtractionEdgeCases: {"type": "definition", "entity": "Test Entity", "definition": long_definition} ] - triples, contexts = agent_extractor.process_extraction_data(data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should handle long definitions without issues assert len(contexts) == 1 @@ -256,7 +256,7 @@ class TestAgentKgExtractionEdgeCases: {"type": "definition", "entity": "AI", "definition": "Another AI definition"}, # Duplicate ] - triples, contexts = agent_extractor.process_extraction_data(data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should process all entries (including duplicates) assert len(contexts) == 4 @@ -280,7 +280,7 @@ class TestAgentKgExtractionEdgeCases: {"type": "relationship", "subject": "test", "predicate": "test", "object": "", "object-entity": True}, ] - triples, contexts = agent_extractor.process_extraction_data(data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should handle empty strings by creating URIs (even if empty) assert len(contexts) == 3 @@ -306,7 +306,7 @@ class TestAgentKgExtractionEdgeCases: } ] - triples, contexts = agent_extractor.process_extraction_data(data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should handle JSON strings in definitions without 
parsing them assert len(contexts) == 2 @@ -334,7 +334,7 @@ class TestAgentKgExtractionEdgeCases: {"type": "relationship", "subject": "A", "predicate": "rel7", "object": "F", "object-entity": 1}, ] - triples, contexts = agent_extractor.process_extraction_data(data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(data, metadata) # Should process all relationships # Note: The current implementation has some logic issues that these tests document @@ -416,7 +416,7 @@ class TestAgentKgExtractionEdgeCases: import time start_time = time.time() - triples, contexts = agent_extractor.process_extraction_data(large_data, metadata) + triples, contexts, _ = agent_extractor.process_extraction_data(large_data, metadata) end_time = time.time() processing_time = end_time - start_time diff --git a/trustgraph-base/trustgraph/knowledge/defs.py b/trustgraph-base/trustgraph/knowledge/defs.py index e8157586..4c7eb41a 100644 --- a/trustgraph-base/trustgraph/knowledge/defs.py +++ b/trustgraph-base/trustgraph/knowledge/defs.py @@ -41,7 +41,7 @@ class QuotedTriple: enabling statements about statements. Example: - # stmt:123 tg:reifies <<:Hope skos:definition "A feeling...">> + # subgraph:123 tg:contains <<:Hope skos:definition "A feeling...">> qt = QuotedTriple( s=Uri("https://example.org/Hope"), p=Uri("http://www.w3.org/2004/02/skos/core#definition"), diff --git a/trustgraph-base/trustgraph/provenance/__init__.py b/trustgraph-base/trustgraph/provenance/__init__.py index b22f44d8..df3c2034 100644 --- a/trustgraph-base/trustgraph/provenance/__init__.py +++ b/trustgraph-base/trustgraph/provenance/__init__.py @@ -2,7 +2,7 @@ Provenance module for extraction-time provenance support. Provides helpers for: -- URI generation for documents, pages, chunks, activities, statements +- URI generation for documents, pages, chunks, activities, subgraphs - PROV-O triple building for provenance metadata - Vocabulary bootstrap for per-collection initialization @@ -38,7 +38,7 @@ from . 
uris import ( chunk_uri_from_page, chunk_uri_from_doc, activity_uri, - statement_uri, + subgraph_uri, agent_uri, # Query-time provenance URIs (GraphRAG) question_uri, @@ -66,11 +66,13 @@ from . namespaces import ( # RDF/RDFS RDF, RDF_TYPE, RDFS, RDFS_LABEL, # TrustGraph - TG, TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER, + TG, TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER, TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH, TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION, TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL, TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH, + # Extraction provenance entity types + TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE, # Query-time provenance predicates (GraphRAG) TG_QUERY, TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_REASONING, TG_CONTENT, # Query-time provenance predicates (DocumentRAG) @@ -94,7 +96,7 @@ from . namespaces import ( from . triples import ( document_triples, derived_entity_triples, - triple_provenance_triples, + subgraph_provenance_triples, # Query-time provenance triple builders (GraphRAG) question_triples, exploration_triples, @@ -121,6 +123,7 @@ from . 
vocabulary import ( PROV_CLASS_LABELS, PROV_PREDICATE_LABELS, DC_PREDICATE_LABELS, + TG_CLASS_LABELS, TG_PREDICATE_LABELS, ) @@ -132,7 +135,7 @@ __all__ = [ "chunk_uri_from_page", "chunk_uri_from_doc", "activity_uri", - "statement_uri", + "subgraph_uri", "agent_uri", # Query-time provenance URIs "question_uri", @@ -153,11 +156,13 @@ __all__ = [ "PROV_USED", "PROV_WAS_ASSOCIATED_WITH", "PROV_STARTED_AT_TIME", "DC", "DC_TITLE", "DC_SOURCE", "DC_DATE", "DC_CREATOR", "RDF", "RDF_TYPE", "RDFS", "RDFS_LABEL", - "TG", "TG_REIFIES", "TG_PAGE_COUNT", "TG_MIME_TYPE", "TG_PAGE_NUMBER", + "TG", "TG_CONTAINS", "TG_PAGE_COUNT", "TG_MIME_TYPE", "TG_PAGE_NUMBER", "TG_CHUNK_INDEX", "TG_CHAR_OFFSET", "TG_CHAR_LENGTH", "TG_CHUNK_SIZE", "TG_CHUNK_OVERLAP", "TG_COMPONENT_VERSION", "TG_LLM_MODEL", "TG_ONTOLOGY", "TG_EMBEDDING_MODEL", "TG_SOURCE_TEXT", "TG_SOURCE_CHAR_OFFSET", "TG_SOURCE_CHAR_LENGTH", + # Extraction provenance entity types + "TG_DOCUMENT_TYPE", "TG_PAGE_TYPE", "TG_CHUNK_TYPE", "TG_SUBGRAPH_TYPE", # Query-time provenance predicates (GraphRAG) "TG_QUERY", "TG_EDGE_COUNT", "TG_SELECTED_EDGE", "TG_REASONING", "TG_CONTENT", # Query-time provenance predicates (DocumentRAG) @@ -178,7 +183,7 @@ __all__ = [ # Triple builders "document_triples", "derived_entity_triples", - "triple_provenance_triples", + "subgraph_provenance_triples", # Query-time provenance triple builders (GraphRAG) "question_triples", "exploration_triples", @@ -199,5 +204,6 @@ __all__ = [ "PROV_CLASS_LABELS", "PROV_PREDICATE_LABELS", "DC_PREDICATE_LABELS", + "TG_CLASS_LABELS", "TG_PREDICATE_LABELS", ] diff --git a/trustgraph-base/trustgraph/provenance/namespaces.py b/trustgraph-base/trustgraph/provenance/namespaces.py index 91e82eac..e60dee16 100644 --- a/trustgraph-base/trustgraph/provenance/namespaces.py +++ b/trustgraph-base/trustgraph/provenance/namespaces.py @@ -42,7 +42,7 @@ SKOS_DEFINITION = SKOS + "definition" # TrustGraph namespace for custom predicates TG = "https://trustgraph.ai/ns/" -TG_REIFIES = TG 
+ "reifies" +TG_CONTAINS = TG + "contains" TG_PAGE_COUNT = TG + "pageCount" TG_MIME_TYPE = TG + "mimeType" TG_PAGE_NUMBER = TG + "pageNumber" @@ -72,6 +72,12 @@ TG_DOCUMENT = TG + "document" # Reference to document in librarian TG_CHUNK_COUNT = TG + "chunkCount" TG_SELECTED_CHUNK = TG + "selectedChunk" +# Extraction provenance entity types +TG_DOCUMENT_TYPE = TG + "Document" +TG_PAGE_TYPE = TG + "Page" +TG_CHUNK_TYPE = TG + "Chunk" +TG_SUBGRAPH_TYPE = TG + "Subgraph" + # Explainability entity types (shared) TG_QUESTION = TG + "Question" TG_EXPLORATION = TG + "Exploration" diff --git a/trustgraph-base/trustgraph/provenance/triples.py b/trustgraph-base/trustgraph/provenance/triples.py index 14581c6f..459783d1 100644 --- a/trustgraph-base/trustgraph/provenance/triples.py +++ b/trustgraph-base/trustgraph/provenance/triples.py @@ -16,7 +16,9 @@ from . namespaces import ( TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER, TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH, TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION, - TG_LLM_MODEL, TG_ONTOLOGY, TG_REIFIES, + TG_LLM_MODEL, TG_ONTOLOGY, TG_CONTAINS, + # Extraction provenance entity types + TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE, # Query-time provenance predicates (GraphRAG) TG_QUERY, TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_EDGE, TG_REASONING, TG_CONTENT, TG_DOCUMENT, @@ -28,7 +30,7 @@ from . namespaces import ( TG_GRAPH_RAG_QUESTION, TG_DOC_RAG_QUESTION, ) -from . uris import activity_uri, agent_uri, edge_selection_uri +from . 
uris import activity_uri, agent_uri, subgraph_uri, edge_selection_uri def set_graph(triples: List[Triple], graph: str) -> List[Triple]: @@ -92,6 +94,7 @@ def document_triples( """ triples = [ _triple(doc_uri, RDF_TYPE, _iri(PROV_ENTITY)), + _triple(doc_uri, RDF_TYPE, _iri(TG_DOCUMENT_TYPE)), ] if title: @@ -162,10 +165,23 @@ def derived_entity_triples( act_uri = activity_uri() agt_uri = agent_uri(component_name) + # Determine specific type from parameters + if page_number is not None: + specific_type = TG_PAGE_TYPE + elif chunk_index is not None: + specific_type = TG_CHUNK_TYPE + else: + specific_type = None + triples = [ # Entity declaration _triple(entity_uri, RDF_TYPE, _iri(PROV_ENTITY)), + ] + if specific_type: + triples.append(_triple(entity_uri, RDF_TYPE, _iri(specific_type))) + + triples.extend([ # Derivation from parent _triple(entity_uri, PROV_WAS_DERIVED_FROM, _iri(parent_uri)), @@ -183,7 +199,7 @@ def derived_entity_triples( # Agent declaration _triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)), _triple(agt_uri, RDFS_LABEL, _literal(component_name)), - ] + ]) if label: triples.append(_triple(entity_uri, RDFS_LABEL, _literal(label))) @@ -209,9 +225,9 @@ def derived_entity_triples( return triples -def triple_provenance_triples( - stmt_uri: str, - extracted_triple: Triple, +def subgraph_provenance_triples( + subgraph_uri: str, + extracted_triples: List[Triple], chunk_uri: str, component_name: str, component_version: str, @@ -220,16 +236,20 @@ def triple_provenance_triples( timestamp: Optional[str] = None, ) -> List[Triple]: """ - Build provenance triples for an extracted knowledge triple using reification. + Build provenance triples for a subgraph of extracted knowledge. + + One subgraph per chunk extraction, shared across all triples produced + from that chunk. This replaces per-triple reification with a + containment model. 
Creates: - - Reification triple: stmt_uri tg:reifies <> - - wasDerivedFrom link to source chunk - - Activity and agent metadata + - tg:contains link for each extracted triple (RDF-star quoted) + - One prov:wasDerivedFrom link to source chunk + - One activity with agent metadata Args: - stmt_uri: URI for the reified statement - extracted_triple: The extracted Triple to reify + subgraph_uri: URI for the extraction subgraph + extracted_triples: The extracted Triple objects to include chunk_uri: URI of source chunk component_name: Name of extractor component component_version: Version of the component @@ -238,7 +258,7 @@ def triple_provenance_triples( timestamp: ISO timestamp Returns: - List of Triple objects for the provenance (including reification) + List of Triple objects for the provenance """ if timestamp is None: timestamp = datetime.utcnow().isoformat() + "Z" @@ -246,20 +266,23 @@ def triple_provenance_triples( act_uri = activity_uri() agt_uri = agent_uri(component_name) - # Create the quoted triple term (RDF-star reification) - triple_term = Term(type=TRIPLE, triple=extracted_triple) + triples = [] - triples = [ - # Reification: stmt_uri tg:reifies <> - Triple( - s=_iri(stmt_uri), - p=_iri(TG_REIFIES), + # Containment: subgraph tg:contains <> for each extracted triple + for extracted_triple in extracted_triples: + triple_term = Term(type=TRIPLE, triple=extracted_triple) + triples.append(Triple( + s=_iri(subgraph_uri), + p=_iri(TG_CONTAINS), o=triple_term - ), + )) - # Statement provenance - _triple(stmt_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)), - _triple(stmt_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)), + # Subgraph provenance + triples.extend([ + _triple(subgraph_uri, RDF_TYPE, _iri(PROV_ENTITY)), + _triple(subgraph_uri, RDF_TYPE, _iri(TG_SUBGRAPH_TYPE)), + _triple(subgraph_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)), + _triple(subgraph_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)), # Activity _triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)), @@ -272,7 +295,7 
@@ def triple_provenance_triples( # Agent _triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)), _triple(agt_uri, RDFS_LABEL, _literal(component_name)), - ] + ]) if llm_model: triples.append(_triple(act_uri, TG_LLM_MODEL, _literal(llm_model))) diff --git a/trustgraph-base/trustgraph/provenance/uris.py b/trustgraph-base/trustgraph/provenance/uris.py index b14abe76..0f8d3136 100644 --- a/trustgraph-base/trustgraph/provenance/uris.py +++ b/trustgraph-base/trustgraph/provenance/uris.py @@ -8,7 +8,7 @@ Child entities (pages, chunks) append path segments to the parent IRI: - Chunk: {page_iri}/c{chunk_index} (from page) {doc_iri}/c{chunk_index} (from text doc) - Activity: https://trustgraph.ai/activity/{uuid} -- Statement: https://trustgraph.ai/stmt/{uuid} +- Subgraph: https://trustgraph.ai/subgraph/{uuid} """ import uuid @@ -50,11 +50,11 @@ def activity_uri(activity_id: str = None) -> str: return f"{TRUSTGRAPH_BASE}/activity/{_encode_id(activity_id)}" -def statement_uri(stmt_id: str = None) -> str: - """Generate URI for a reified statement. Auto-generates UUID if not provided.""" - if stmt_id is None: - stmt_id = str(uuid.uuid4()) - return f"{TRUSTGRAPH_BASE}/stmt/{_encode_id(stmt_id)}" +def subgraph_uri(subgraph_id: str = None) -> str: + """Generate URI for an extraction subgraph. Auto-generates UUID if not provided.""" + if subgraph_id is None: + subgraph_id = str(uuid.uuid4()) + return f"{TRUSTGRAPH_BASE}/subgraph/{_encode_id(subgraph_id)}" def agent_uri(component_name: str) -> str: diff --git a/trustgraph-base/trustgraph/provenance/vocabulary.py b/trustgraph-base/trustgraph/provenance/vocabulary.py index 47164da8..f9ae1abd 100644 --- a/trustgraph-base/trustgraph/provenance/vocabulary.py +++ b/trustgraph-base/trustgraph/provenance/vocabulary.py @@ -19,11 +19,12 @@ from . 
namespaces import ( SCHEMA_SUBJECT_OF, SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION, SCHEMA_KEYWORDS, SCHEMA_NAME, SKOS_DEFINITION, - TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER, + TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER, TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH, TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION, TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL, TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH, + TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE, ) @@ -74,9 +75,17 @@ SKOS_LABELS = [ _label_triple(SKOS_DEFINITION, "definition"), ] +# TrustGraph class labels (extraction provenance) +TG_CLASS_LABELS = [ + _label_triple(TG_DOCUMENT_TYPE, "Document"), + _label_triple(TG_PAGE_TYPE, "Page"), + _label_triple(TG_CHUNK_TYPE, "Chunk"), + _label_triple(TG_SUBGRAPH_TYPE, "Subgraph"), +] + # TrustGraph predicate labels TG_PREDICATE_LABELS = [ - _label_triple(TG_REIFIES, "reifies"), + _label_triple(TG_CONTAINS, "contains"), _label_triple(TG_PAGE_COUNT, "page count"), _label_triple(TG_MIME_TYPE, "MIME type"), _label_triple(TG_PAGE_NUMBER, "page number"), @@ -116,5 +125,6 @@ def get_vocabulary_triples() -> List[Triple]: DC_PREDICATE_LABELS + SCHEMA_LABELS + SKOS_LABELS + + TG_CLASS_LABELS + TG_PREDICATE_LABELS ) diff --git a/trustgraph-cli/pyproject.toml b/trustgraph-cli/pyproject.toml index f2ad2ac8..fb3402c9 100644 --- a/trustgraph-cli/pyproject.toml +++ b/trustgraph-cli/pyproject.toml @@ -96,7 +96,7 @@ tg-delete-config-item = "trustgraph.cli.delete_config_item:main" tg-list-collections = "trustgraph.cli.list_collections:main" tg-set-collection = "trustgraph.cli.set_collection:main" tg-delete-collection = "trustgraph.cli.delete_collection:main" -tg-show-document-hierarchy = "trustgraph.cli.show_document_hierarchy:main" +tg-show-extraction-provenance = "trustgraph.cli.show_extraction_provenance:main" tg-list-explain-traces = "trustgraph.cli.list_explain_traces:main" tg-show-explain-trace = 
"trustgraph.cli.show_explain_trace:main" diff --git a/trustgraph-cli/trustgraph/cli/invoke_graph_rag.py b/trustgraph-cli/trustgraph/cli/invoke_graph_rag.py index 27c6854d..295df0b9 100644 --- a/trustgraph-cli/trustgraph/cli/invoke_graph_rag.py +++ b/trustgraph-cli/trustgraph/cli/invoke_graph_rag.py @@ -36,7 +36,7 @@ TG_SELECTED_EDGE = TG + "selectedEdge" TG_EDGE = TG + "edge" TG_REASONING = TG + "reasoning" TG_CONTENT = TG + "content" -TG_REIFIES = TG + "reifies" +TG_CONTAINS = TG + "contains" PROV = "http://www.w3.org/ns/prov#" PROV_STARTED_AT_TIME = PROV + "startedAtTime" PROV_WAS_DERIVED_FROM = PROV + "wasDerivedFrom" @@ -185,18 +185,18 @@ async def _query_edge_provenance(ws_url, flow_id, edge_s, edge_p, edge_o, user, """ Query for provenance of an edge (s, p, o) in the knowledge graph. - Finds statements that reify the edge via tg:reifies, then follows + Finds subgraphs that contain the edge via tg:contains, then follows prov:wasDerivedFrom to find source documents. Returns list of source URIs (chunks, pages, documents). 
""" - # Query for statements that reify this edge: ?stmt tg:reifies <> + # Query for subgraphs that contain this edge: ?subgraph tg:contains <> request = { "id": "edge-prov-request", "service": "triples", "flow": flow_id, "request": { - "p": {"t": "i", "i": TG_REIFIES}, + "p": {"t": "i", "i": TG_CONTAINS}, "o": { "t": "t", # Quoted triple type "tr": { diff --git a/trustgraph-cli/trustgraph/cli/show_explain_trace.py b/trustgraph-cli/trustgraph/cli/show_explain_trace.py index b3fc7058..d7392b99 100644 --- a/trustgraph-cli/trustgraph/cli/show_explain_trace.py +++ b/trustgraph-cli/trustgraph/cli/show_explain_trace.py @@ -40,7 +40,7 @@ SOURCE_GRAPH = "urn:graph:source" # Provenance predicates for edge tracing TG = "https://trustgraph.ai/ns/" -TG_REIFIES = TG + "reifies" +TG_CONTAINS = TG + "contains" PROV = "http://www.w3.org/ns/prov#" PROV_WAS_DERIVED_FROM = PROV + "wasDerivedFrom" @@ -79,10 +79,10 @@ def trace_edge_provenance(flow, user, collection, edge, label_cache, explain_cli } } - # Query: ?stmt tg:reifies <> + # Query: ?subgraph tg:contains <> try: results = flow.triples_query( - p=TG_REIFIES, + p=TG_CONTAINS, o=quoted_triple, g=SOURCE_GRAPH, user=user, diff --git a/trustgraph-cli/trustgraph/cli/show_document_hierarchy.py b/trustgraph-cli/trustgraph/cli/show_extraction_provenance.py similarity index 81% rename from trustgraph-cli/trustgraph/cli/show_document_hierarchy.py rename to trustgraph-cli/trustgraph/cli/show_extraction_provenance.py index be83efd5..4f87712c 100644 --- a/trustgraph-cli/trustgraph/cli/show_document_hierarchy.py +++ b/trustgraph-cli/trustgraph/cli/show_extraction_provenance.py @@ -1,12 +1,12 @@ """ -Show document hierarchy: Document -> Pages -> Chunks -> Edges. +Show extraction provenance: Document -> Pages -> Chunks -> Edges. Given a document ID, traverses and displays all derived entities (pages, chunks, extracted edges) using prov:wasDerivedFrom relationships. 
Examples: - tg-show-document-hierarchy -U trustgraph -C default "urn:trustgraph:doc:abc123" - tg-show-document-hierarchy --show-content --max-content 500 "urn:trustgraph:doc:abc123" + tg-show-extraction-provenance -U trustgraph -C default "urn:trustgraph:doc:abc123" + tg-show-extraction-provenance --show-content --max-content 500 "urn:trustgraph:doc:abc123" """ import argparse @@ -25,10 +25,22 @@ PROV_WAS_DERIVED_FROM = "http://www.w3.org/ns/prov#wasDerivedFrom" RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label" RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" TG = "https://trustgraph.ai/ns/" -TG_REIFIES = TG + "reifies" +TG_CONTAINS = TG + "contains" +TG_DOCUMENT_TYPE = TG + "Document" +TG_PAGE_TYPE = TG + "Page" +TG_CHUNK_TYPE = TG + "Chunk" +TG_SUBGRAPH_TYPE = TG + "Subgraph" DC_TITLE = "http://purl.org/dc/terms/title" DC_FORMAT = "http://purl.org/dc/terms/format" +# Map TrustGraph type URIs to display names +TYPE_MAP = { + TG_DOCUMENT_TYPE: "document", + TG_PAGE_TYPE: "page", + TG_CHUNK_TYPE: "chunk", + TG_SUBGRAPH_TYPE: "subgraph", +} + # Source graph SOURCE_GRAPH = "urn:graph:source" @@ -109,15 +121,15 @@ def extract_value(term): def get_node_metadata(socket, flow_id, user, collection, node_uri): - """Get metadata for a node (label, type, title, format).""" + """Get metadata for a node (label, types, title, format).""" triples = query_triples(socket, flow_id, user, collection, s=node_uri, g=SOURCE_GRAPH) - metadata = {"uri": node_uri} + metadata = {"uri": node_uri, "types": []} for s, p, o in triples: if p == RDFS_LABEL: metadata["label"] = o elif p == RDF_TYPE: - metadata["type"] = o + metadata["types"].append(o) elif p == DC_TITLE: metadata["title"] = o elif p == DC_FORMAT: @@ -126,6 +138,14 @@ def get_node_metadata(socket, flow_id, user, collection, node_uri): return metadata +def classify_node(metadata): + """Classify a node based on its rdf:type values.""" + for type_uri in metadata.get("types", []): + if type_uri in TYPE_MAP: + return 
TYPE_MAP[type_uri] + return "unknown" + + def get_children(socket, flow_id, user, collection, parent_uri): """Get children of a node via prov:wasDerivedFrom.""" triples = query_triples( @@ -135,29 +155,6 @@ def get_children(socket, flow_id, user, collection, parent_uri): return [s for s, p, o in triples] -def get_edges_from_chunk(socket, flow_id, user, collection, chunk_uri): - """Get edges that were derived from a chunk (via tg:reifies).""" - # Query for triples where: ?stmt prov:wasDerivedFrom chunk_uri - # Then get the tg:reifies value - derived_triples = query_triples( - socket, flow_id, user, collection, - p=PROV_WAS_DERIVED_FROM, o=chunk_uri, g=SOURCE_GRAPH - ) - - edges = [] - for stmt_uri, _, _ in derived_triples: - # Get what this statement reifies - reifies_triples = query_triples( - socket, flow_id, user, collection, - s=stmt_uri, p=TG_REIFIES, g=SOURCE_GRAPH - ) - for _, _, edge in reifies_triples: - if isinstance(edge, dict): - edges.append(edge) - - return edges - - def get_document_content(api, user, doc_id, max_content): """Fetch document content from librarian API.""" try: @@ -176,32 +173,6 @@ def get_document_content(api, user, doc_id, max_content): return f"[Error fetching content: {e}]" -def classify_uri(uri): - """Classify a URI as document, page, or chunk based on patterns.""" - if not isinstance(uri, str): - return "unknown" - - # Common patterns in trustgraph URIs - if "/c" in uri and uri.split("/c")[-1].isdigit(): - return "chunk" - if "/p" in uri and any(uri.split("/p")[-1].replace("/", "").isdigit() for _ in [1]): - # Check for page pattern like /p1 or /p1/ - parts = uri.split("/p") - if len(parts) > 1: - remainder = parts[-1].split("/")[0] - if remainder.isdigit(): - return "page" - - if "chunk" in uri.lower(): - return "chunk" - if "page" in uri.lower(): - return "page" - if "doc" in uri.lower(): - return "document" - - return "unknown" - - def build_hierarchy(socket, flow_id, user, collection, root_uri, api=None, show_content=False, 
max_content=200, visited=None): """Build document hierarchy tree recursively.""" if visited is None: @@ -212,7 +183,7 @@ def build_hierarchy(socket, flow_id, user, collection, root_uri, api=None, show_ visited.add(root_uri) metadata = get_node_metadata(socket, flow_id, user, collection, root_uri) - node_type = classify_uri(root_uri) + node_type = classify_node(metadata) node = { "uri": root_uri, @@ -232,10 +203,20 @@ def build_hierarchy(socket, flow_id, user, collection, root_uri, api=None, show_ children_uris = get_children(socket, flow_id, user, collection, root_uri) for child_uri in children_uris: - child_type = classify_uri(child_uri) + child_metadata = get_node_metadata(socket, flow_id, user, collection, child_uri) + child_type = classify_node(child_metadata) - # Recursively build hierarchy for pages and chunks - if child_type in ("page", "chunk", "unknown"): + if child_type == "subgraph": + # Subgraphs contain extracted edges — inline them + contains_triples = query_triples( + socket, flow_id, user, collection, + s=child_uri, p=TG_CONTAINS, g=SOURCE_GRAPH + ) + for _, _, edge in contains_triples: + if isinstance(edge, dict): + node["edges"].append(edge) + else: + # Recurse into pages, chunks, etc. 
child_node = build_hierarchy( socket, flow_id, user, collection, child_uri, api=api, show_content=show_content, max_content=max_content, @@ -244,11 +225,6 @@ def build_hierarchy(socket, flow_id, user, collection, root_uri, api=None, show_ if child_node: node["children"].append(child_node) - # Get edges for chunks - if node_type == "chunk": - edges = get_edges_from_chunk(socket, flow_id, user, collection, root_uri) - node["edges"] = edges - # Sort children by URI for consistent output node["children"].sort(key=lambda x: x.get("uri", "")) @@ -332,7 +308,7 @@ def print_json(node): def main(): parser = argparse.ArgumentParser( - prog='tg-show-document-hierarchy', + prog='tg-show-extraction-provenance', description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, ) diff --git a/trustgraph-flow/trustgraph/extract/kg/agent/extract.py b/trustgraph-flow/trustgraph/extract/kg/agent/extract.py index a00944d6..28dba11a 100644 --- a/trustgraph-flow/trustgraph/extract/kg/agent/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/agent/extract.py @@ -11,6 +11,8 @@ from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, SUBJECT_OF, DEFINITION from ....base import FlowProcessor, ConsumerSpec, ProducerSpec from ....base import AgentClientSpec +from ....provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE +from ....flow_version import __version__ as COMPONENT_VERSION from ....template import PromptManager # Module logger @@ -196,9 +198,21 @@ class Processor(FlowProcessor): return # Process extraction data - triples, entity_contexts = self.process_extraction_data( - extraction_data, v.metadata - ) + triples, entity_contexts, extracted_triples = \ + self.process_extraction_data(extraction_data, v.metadata) + + # Generate subgraph provenance for extracted triples + if extracted_triples: + chunk_uri = v.metadata.id + sg_uri = subgraph_uri() + prov_triples = subgraph_provenance_triples( + subgraph_uri=sg_uri, + 
extracted_triples=extracted_triples, + chunk_uri=chunk_uri, + component_name=default_ident, + component_version=COMPONENT_VERSION, + ) + triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) # Emit outputs if triples: @@ -221,8 +235,13 @@ class Processor(FlowProcessor): Data is a flat list of objects with 'type' discriminator field: - {"type": "definition", "entity": "...", "definition": "..."} - {"type": "relationship", "subject": "...", "predicate": "...", "object": "...", "object-entity": bool} + + Returns: + Tuple of (all_triples, entity_contexts, extracted_triples) where + extracted_triples contains only the core knowledge facts (for provenance). """ triples = [] + extracted_triples = [] entity_contexts = [] # Categorize items by type @@ -242,11 +261,13 @@ class Processor(FlowProcessor): )) # Add definition - triples.append(Triple( + definition_triple = Triple( s = Term(type=IRI, iri=entity_uri), p = Term(type=IRI, iri=DEFINITION), o = Term(type=LITERAL, value=defn["definition"]), - )) + ) + triples.append(definition_triple) + extracted_triples.append(definition_triple) # Add subject-of relationship to document if metadata.id: @@ -261,7 +282,7 @@ class Processor(FlowProcessor): entity=Term(type=IRI, iri=entity_uri), context=defn["definition"] )) - + # Process relationships for rel in relationships: @@ -298,11 +319,13 @@ class Processor(FlowProcessor): )) # Add the main relationship triple - triples.append(Triple( + relationship_triple = Triple( s = subject_value, p = predicate_value, o = object_value - )) + ) + triples.append(relationship_triple) + extracted_triples.append(relationship_triple) # Add subject-of relationships to document if metadata.id: @@ -324,8 +347,8 @@ class Processor(FlowProcessor): p = Term(type=IRI, iri=SUBJECT_OF), o = Term(type=IRI, iri=metadata.id), )) - - return triples, entity_contexts + + return triples, entity_contexts, extracted_triples @staticmethod def add_args(parser): diff --git 
a/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py b/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py index 277157b9..44ea778d 100755 --- a/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/definitions/extract.py @@ -20,7 +20,7 @@ from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF from .... base import FlowProcessor, ConsumerSpec, ProducerSpec from .... base import PromptClientSpec, ParameterSpec -from .... provenance import statement_uri, triple_provenance_triples, set_graph, GRAPH_SOURCE +from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE from .... flow_version import __version__ as COMPONENT_VERSION DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION) @@ -133,6 +133,7 @@ class Processor(FlowProcessor): raise e triples = [] + extracted_triples = [] entities = [] # Get chunk document ID for provenance linking @@ -173,20 +174,7 @@ class Processor(FlowProcessor): s=s_value, p=DEFINITION_VALUE, o=o_value ) triples.append(definition_triple) - - # Generate provenance for the definition triple (reification) - # Provenance triples go in the source graph for separation from core knowledge - stmt_uri = statement_uri() - prov_triples = triple_provenance_triples( - stmt_uri=stmt_uri, - extracted_triple=definition_triple, - chunk_uri=chunk_uri, - component_name=default_ident, - component_version=COMPONENT_VERSION, - llm_model=llm_model, - ontology_uri=ontology_uri, - ) - triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) + extracted_triples.append(definition_triple) # Link entity to chunk (not top-level document) triples.append(Triple( @@ -211,6 +199,20 @@ class Processor(FlowProcessor): chunk_id=chunk_doc_id, )) + # Generate subgraph provenance once for all extracted triples + if extracted_triples: + sg_uri = subgraph_uri() + prov_triples = subgraph_provenance_triples( + subgraph_uri=sg_uri, + extracted_triples=extracted_triples, 
+ chunk_uri=chunk_uri, + component_name=default_ident, + component_version=COMPONENT_VERSION, + llm_model=llm_model, + ontology_uri=ontology_uri, + ) + triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) + # Send triples in batches for i in range(0, len(triples), self.triples_batch_size): batch = triples[i:i + self.triples_batch_size] diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py index 16383752..5078d817 100644 --- a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py @@ -23,6 +23,9 @@ from .ontology_selector import OntologySelector, OntologySubset from .simplified_parser import parse_extraction_response from .triple_converter import TripleConverter +from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE +from .... flow_version import __version__ as COMPONENT_VERSION + logger = logging.getLogger(__name__) default_ident = "kg-extract-ontology" @@ -306,11 +309,25 @@ class Processor(FlowProcessor): flow, chunk, ontology_subset, prompt_variables ) + # Generate subgraph provenance for extracted triples + if triples: + chunk_uri = v.metadata.id + sg_uri = subgraph_uri() + prov_triples = subgraph_provenance_triples( + subgraph_uri=sg_uri, + extracted_triples=triples, + chunk_uri=chunk_uri, + component_name=default_ident, + component_version=COMPONENT_VERSION, + ) + # Generate ontology definition triples ontology_triples = self.build_ontology_triples(ontology_subset) - # Combine extracted triples with ontology triples + # Combine extracted triples with ontology triples and provenance all_triples = triples + ontology_triples + if triples: + all_triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) # Build entity contexts from all triples (including ontology elements) entity_contexts = self.build_entity_contexts(all_triples) diff --git 
a/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py b/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py index a136acc6..604de1df 100755 --- a/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/relationships/extract.py @@ -20,7 +20,7 @@ from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF from .... base import FlowProcessor, ConsumerSpec, ProducerSpec from .... base import PromptClientSpec, ParameterSpec -from .... provenance import statement_uri, triple_provenance_triples, set_graph, GRAPH_SOURCE +from .... provenance import subgraph_uri, subgraph_provenance_triples, set_graph, GRAPH_SOURCE from .... flow_version import __version__ as COMPONENT_VERSION RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL) @@ -115,6 +115,7 @@ class Processor(FlowProcessor): raise e triples = [] + extracted_triples = [] # Get chunk document ID for provenance linking chunk_doc_id = v.document_id if v.document_id else v.metadata.id @@ -160,20 +161,7 @@ class Processor(FlowProcessor): o=o_value ) triples.append(relationship_triple) - - # Generate provenance for the relationship triple (reification) - # Provenance triples go in the source graph for separation from core knowledge - stmt_uri = statement_uri() - prov_triples = triple_provenance_triples( - stmt_uri=stmt_uri, - extracted_triple=relationship_triple, - chunk_uri=chunk_uri, - component_name=default_ident, - component_version=COMPONENT_VERSION, - llm_model=llm_model, - ontology_uri=ontology_uri, - ) - triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) + extracted_triples.append(relationship_triple) # Label for s triples.append(Triple( @@ -212,6 +200,20 @@ class Processor(FlowProcessor): o=Term(type=IRI, iri=chunk_uri) )) + # Generate subgraph provenance once for all extracted triples + if extracted_triples: + sg_uri = subgraph_uri() + prov_triples = subgraph_provenance_triples( + subgraph_uri=sg_uri, + extracted_triples=extracted_triples, + 
chunk_uri=chunk_uri, + component_name=default_ident, + component_version=COMPONENT_VERSION, + llm_model=llm_model, + ontology_uri=ontology_uri, + ) + triples.extend(set_graph(prov_triples, GRAPH_SOURCE)) + # Send triples in batches for i in range(0, len(triples), self.triples_batch_size): batch = triples[i:i + self.triples_batch_size]