Extract-time provenance (#661)

1. Shared Provenance Module - URI generators, namespace constants,
   triple builders, vocabulary bootstrap
2. Librarian - Emits document metadata to graph on processing
   initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
   provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
   forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
   - Link entities to chunks via SUBJECT_OF (not top-level document)
   - Removed duplicate metadata emission (now handled by librarian)
   - Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
   - EntityContext schema has chunk_id field
   - EntityEmbeddings schema has chunk_id field
   - Definitions extractor sets chunk_id when creating EntityContext
   - Graph embeddings processor passes chunk_id through to
     EntityEmbeddings

Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
    ↓           ↓          ↓              ↓
  librarian  librarian  librarian    (chunk_id reference)
  + graph    + graph    + graph

Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.

Also updates the affected tests.
This commit is contained in:
cybermaggedon 2026-03-05 18:36:10 +00:00 committed by GitHub
parent d8f0a576af
commit cd5580be59
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 1601 additions and 59 deletions

View file

@ -15,7 +15,7 @@ from .consumer import Consumer
from .producer import Producer
from .metrics import ConsumerMetrics, ProducerMetrics
from ..schema import LibrarianRequest, LibrarianResponse
from ..schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
from ..schema import librarian_request_queue, librarian_response_queue
# Module logger
@ -135,6 +135,67 @@ class ChunkingService(FlowProcessor):
self.pending_requests.pop(request_id, None)
raise RuntimeError(f"Timeout fetching document {document_id}")
async def save_child_document(self, doc_id, parent_id, user, content,
                              document_type="chunk", title=None, timeout=120):
    """
    Save a child document (chunk) to the librarian.

    Sends an "add-child-document" request and waits up to `timeout`
    seconds for the matching response, correlated by a per-request UUID
    carried in the message properties.

    Args:
        doc_id: ID for the new child document
        parent_id: ID of the parent document
        user: User ID
        content: Document content (bytes or str; str is UTF-8 encoded)
        document_type: Type of document ("chunk", etc.)
        title: Optional title (defaults to doc_id)
        timeout: Request timeout in seconds

    Returns:
        The document ID on success

    Raises:
        RuntimeError: if the librarian reports an error or the request
            times out.
    """
    request_id = str(uuid.uuid4())

    if isinstance(content, str):
        content = content.encode("utf-8")

    doc_metadata = DocumentMetadata(
        id=doc_id,
        user=user,
        kind="text/plain",
        title=title or doc_id,
        parent_id=parent_id,
        document_type=document_type,
    )

    request = LibrarianRequest(
        operation="add-child-document",
        document_metadata=doc_metadata,
        content=base64.b64encode(content).decode("utf-8"),
    )

    # Create future for response; presumably the response consumer
    # resolves it by matching the "id" property on the reply.
    # get_running_loop() is the non-deprecated call inside a coroutine.
    future = asyncio.get_running_loop().create_future()
    self.pending_requests[request_id] = future

    try:
        # Send request
        await self.librarian_request_producer.send(
            request, properties={"id": request_id}
        )

        # Wait for response
        response = await asyncio.wait_for(future, timeout=timeout)

        if response.error:
            raise RuntimeError(
                f"Librarian error saving chunk: {response.error.type}: {response.error.message}"
            )

        return doc_id

    except asyncio.TimeoutError:
        raise RuntimeError(f"Timeout saving chunk {doc_id}")

    finally:
        # Always drop the pending entry so failed sends / librarian error
        # responses do not leak futures; pop(..., None) is a no-op if the
        # response handler already removed it.
        self.pending_requests.pop(request_id, None)
async def get_document_text(self, doc):
"""
Get text content from a TextDocument, fetching from librarian if needed.

View file

@ -0,0 +1,110 @@
"""
Provenance module for extraction-time provenance support.
Provides helpers for:
- URI generation for documents, pages, chunks, activities, statements
- PROV-O triple building for provenance metadata
- Vocabulary bootstrap for per-collection initialization
Usage example:
from trustgraph.provenance import (
document_uri, page_uri, chunk_uri_from_page,
document_triples, derived_entity_triples,
get_vocabulary_triples,
)
# Generate URIs
doc_uri = document_uri("my-doc-123")
page_uri = page_uri("my-doc-123", page_number=1)
# Build provenance triples
triples = document_triples(
doc_uri,
title="My Document",
mime_type="application/pdf",
page_count=10,
)
# Get vocabulary bootstrap triples (once per collection)
vocab_triples = get_vocabulary_triples()
"""
# URI generation
from . uris import (
TRUSTGRAPH_BASE,
document_uri,
page_uri,
chunk_uri_from_page,
chunk_uri_from_doc,
activity_uri,
statement_uri,
agent_uri,
)
# Namespace constants
from . namespaces import (
# PROV-O
PROV, PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
# Dublin Core
DC, DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
# RDF/RDFS
RDF, RDF_TYPE, RDFS, RDFS_LABEL,
# TrustGraph
TG, TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
)
# Triple builders
from . triples import (
document_triples,
derived_entity_triples,
triple_provenance_triples,
)
# Vocabulary bootstrap
from . vocabulary import (
get_vocabulary_triples,
PROV_CLASS_LABELS,
PROV_PREDICATE_LABELS,
DC_PREDICATE_LABELS,
TG_PREDICATE_LABELS,
)
__all__ = [
# URIs
"TRUSTGRAPH_BASE",
"document_uri",
"page_uri",
"chunk_uri_from_page",
"chunk_uri_from_doc",
"activity_uri",
"statement_uri",
"agent_uri",
# Namespaces
"PROV", "PROV_ENTITY", "PROV_ACTIVITY", "PROV_AGENT",
"PROV_WAS_DERIVED_FROM", "PROV_WAS_GENERATED_BY",
"PROV_USED", "PROV_WAS_ASSOCIATED_WITH", "PROV_STARTED_AT_TIME",
"DC", "DC_TITLE", "DC_SOURCE", "DC_DATE", "DC_CREATOR",
"RDF", "RDF_TYPE", "RDFS", "RDFS_LABEL",
"TG", "TG_REIFIES", "TG_PAGE_COUNT", "TG_MIME_TYPE", "TG_PAGE_NUMBER",
"TG_CHUNK_INDEX", "TG_CHAR_OFFSET", "TG_CHAR_LENGTH",
"TG_CHUNK_SIZE", "TG_CHUNK_OVERLAP", "TG_COMPONENT_VERSION",
"TG_LLM_MODEL", "TG_ONTOLOGY", "TG_EMBEDDING_MODEL",
"TG_SOURCE_TEXT", "TG_SOURCE_CHAR_OFFSET", "TG_SOURCE_CHAR_LENGTH",
# Triple builders
"document_triples",
"derived_entity_triples",
"triple_provenance_triples",
# Vocabulary
"get_vocabulary_triples",
"PROV_CLASS_LABELS",
"PROV_PREDICATE_LABELS",
"DC_PREDICATE_LABELS",
"TG_PREDICATE_LABELS",
]

View file

@ -0,0 +1,48 @@
"""
RDF namespace constants for provenance.
Includes PROV-O, Dublin Core, and TrustGraph namespace URIs.
"""
# PROV-O namespace (W3C Provenance Ontology)
PROV = "http://www.w3.org/ns/prov#"
PROV_ENTITY = PROV + "Entity"
PROV_ACTIVITY = PROV + "Activity"
PROV_AGENT = PROV + "Agent"
PROV_WAS_DERIVED_FROM = PROV + "wasDerivedFrom"
PROV_WAS_GENERATED_BY = PROV + "wasGeneratedBy"
PROV_USED = PROV + "used"
PROV_WAS_ASSOCIATED_WITH = PROV + "wasAssociatedWith"
PROV_STARTED_AT_TIME = PROV + "startedAtTime"
# Dublin Core namespace
DC = "http://purl.org/dc/elements/1.1/"
DC_TITLE = DC + "title"
DC_SOURCE = DC + "source"
DC_DATE = DC + "date"
DC_CREATOR = DC + "creator"
# RDF/RDFS namespace (also in rdf.py, but included here for completeness)
RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
RDF_TYPE = RDF + "type"
RDFS = "http://www.w3.org/2000/01/rdf-schema#"
RDFS_LABEL = RDFS + "label"
# TrustGraph namespace for custom predicates
TG = "https://trustgraph.ai/ns/"
TG_REIFIES = TG + "reifies"
TG_PAGE_COUNT = TG + "pageCount"
TG_MIME_TYPE = TG + "mimeType"
TG_PAGE_NUMBER = TG + "pageNumber"
TG_CHUNK_INDEX = TG + "chunkIndex"
TG_CHAR_OFFSET = TG + "charOffset"
TG_CHAR_LENGTH = TG + "charLength"
TG_CHUNK_SIZE = TG + "chunkSize"
TG_CHUNK_OVERLAP = TG + "chunkOverlap"
TG_COMPONENT_VERSION = TG + "componentVersion"
TG_LLM_MODEL = TG + "llmModel"
TG_ONTOLOGY = TG + "ontology"
TG_EMBEDDING_MODEL = TG + "embeddingModel"
TG_SOURCE_TEXT = TG + "sourceText"
TG_SOURCE_CHAR_OFFSET = TG + "sourceCharOffset"
TG_SOURCE_CHAR_LENGTH = TG + "sourceCharLength"

View file

@ -0,0 +1,251 @@
"""
Helper functions to build PROV-O triples for extraction-time provenance.
"""
from datetime import datetime, timezone
from typing import List, Optional

from ..schema import Triple, Term, IRI, LITERAL

from .namespaces import (
    RDF_TYPE, RDFS_LABEL,
    PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
    PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
    PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
    DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
    TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
    TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
    TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
    TG_LLM_MODEL, TG_ONTOLOGY, TG_REIFIES,
)
from .uris import activity_uri, agent_uri
def _iri(uri: str) -> Term:
    """Wrap *uri* as an IRI Term."""
    return Term(type=IRI, iri=uri)
def _literal(value) -> Term:
    """Wrap *value* as a literal Term; non-string values are coerced with str()."""
    return Term(type=LITERAL, value=str(value))
def _triple(s: str, p: str, o_term: Term) -> Triple:
    """Create a triple with IRI subject and predicate and a pre-built object term."""
    return Triple(s=_iri(s), p=_iri(p), o=o_term)
def document_triples(
    doc_uri: str,
    title: Optional[str] = None,
    source: Optional[str] = None,
    date: Optional[str] = None,
    creator: Optional[str] = None,
    page_count: Optional[int] = None,
    mime_type: Optional[str] = None,
) -> List[Triple]:
    """
    Build triples describing a source document entity.

    The document is always typed as prov:Entity; every other triple is
    emitted only when the corresponding argument is supplied.

    Args:
        doc_uri: The document URI (from uris.document_uri)
        title: Document title (also emitted as rdfs:label)
        source: Source URL/path (emitted as an IRI object)
        date: Document date
        creator: Author/creator
        page_count: Number of pages (for PDFs)
        mime_type: MIME type

    Returns:
        List of Triple objects
    """
    out = [_triple(doc_uri, RDF_TYPE, _iri(PROV_ENTITY))]

    # The title doubles as the graph display label.
    if title:
        out.append(_triple(doc_uri, DC_TITLE, _literal(title)))
        out.append(_triple(doc_uri, RDFS_LABEL, _literal(title)))

    # dc:source points at another resource, so it is an IRI, not a literal.
    if source:
        out.append(_triple(doc_uri, DC_SOURCE, _iri(source)))

    for predicate, value in ((DC_DATE, date), (DC_CREATOR, creator)):
        if value:
            out.append(_triple(doc_uri, predicate, _literal(value)))

    # page_count may legitimately be 0, so compare against None explicitly.
    if page_count is not None:
        out.append(_triple(doc_uri, TG_PAGE_COUNT, _literal(page_count)))

    if mime_type:
        out.append(_triple(doc_uri, TG_MIME_TYPE, _literal(mime_type)))

    return out
def derived_entity_triples(
    entity_uri: str,
    parent_uri: str,
    component_name: str,
    component_version: str,
    label: Optional[str] = None,
    page_number: Optional[int] = None,
    chunk_index: Optional[int] = None,
    char_offset: Optional[int] = None,
    char_length: Optional[int] = None,
    chunk_size: Optional[int] = None,
    chunk_overlap: Optional[int] = None,
    timestamp: Optional[str] = None,
) -> List[Triple]:
    """
    Build triples for a derived entity (page or chunk) with full PROV-O provenance.

    Creates:
    - Entity declaration
    - wasDerivedFrom relationship to parent
    - Activity for the extraction
    - Agent for the component

    Positional metadata (page number, chunk index, offsets) attaches to the
    entity; configuration metadata (chunk size/overlap, component version)
    attaches to the activity.

    Args:
        entity_uri: URI of the derived entity (page or chunk)
        parent_uri: URI of the parent entity
        component_name: Name of TG component (e.g., "pdf-extractor", "chunker")
        component_version: Version of the component
        label: Human-readable label
        page_number: Page number (for pages)
        chunk_index: Chunk index (for chunks)
        char_offset: Character offset in parent (for chunks)
        char_length: Character length (for chunks)
        chunk_size: Configured chunk size (for chunking activity)
        chunk_overlap: Configured chunk overlap (for chunking activity)
        timestamp: ISO timestamp (defaults to now, UTC)

    Returns:
        List of Triple objects
    """
    if timestamp is None:
        # datetime.utcnow() is deprecated (Python 3.12); build the identical
        # naive-UTC ISO-8601 "...Z" string from an aware datetime instead.
        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    act_uri = activity_uri()            # fresh UUID-based activity per call
    agt_uri = agent_uri(component_name)

    triples = [
        # Entity declaration
        _triple(entity_uri, RDF_TYPE, _iri(PROV_ENTITY)),
        # Derivation from parent
        _triple(entity_uri, PROV_WAS_DERIVED_FROM, _iri(parent_uri)),
        # Generation by activity
        _triple(entity_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),
        # Activity declaration
        _triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
        _triple(act_uri, PROV_USED, _iri(parent_uri)),
        _triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
        _triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
        _triple(act_uri, TG_COMPONENT_VERSION, _literal(component_version)),
        # Agent declaration
        _triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)),
        _triple(agt_uri, RDFS_LABEL, _literal(component_name)),
    ]

    if label:
        triples.append(_triple(entity_uri, RDFS_LABEL, _literal(label)))

    # Numeric fields use "is not None" so 0 is a valid value.
    if page_number is not None:
        triples.append(_triple(entity_uri, TG_PAGE_NUMBER, _literal(page_number)))
    if chunk_index is not None:
        triples.append(_triple(entity_uri, TG_CHUNK_INDEX, _literal(chunk_index)))
    if char_offset is not None:
        triples.append(_triple(entity_uri, TG_CHAR_OFFSET, _literal(char_offset)))
    if char_length is not None:
        triples.append(_triple(entity_uri, TG_CHAR_LENGTH, _literal(char_length)))

    # Chunking configuration describes the activity, not the entity.
    if chunk_size is not None:
        triples.append(_triple(act_uri, TG_CHUNK_SIZE, _literal(chunk_size)))
    if chunk_overlap is not None:
        triples.append(_triple(act_uri, TG_CHUNK_OVERLAP, _literal(chunk_overlap)))

    return triples
def triple_provenance_triples(
    stmt_uri: str,
    subject_uri: str,
    predicate_uri: str,
    object_term: Term,
    chunk_uri: str,
    component_name: str,
    component_version: str,
    llm_model: Optional[str] = None,
    ontology_uri: Optional[str] = None,
    timestamp: Optional[str] = None,
) -> List[Triple]:
    """
    Build provenance triples for an extracted knowledge triple using reification.

    Creates:
    - Statement object that reifies the triple
    - wasDerivedFrom link to source chunk
    - Activity and agent metadata

    Note: subject_uri, predicate_uri, and object_term are accepted so the
    reified statement can eventually point at the triple itself, but they
    are intentionally unused here — the actual reification link
    (tg:reifies pointing at the edge) requires RDF 1.2 triple-term support
    and must be handled by the knowledge extractor using the graph store's
    reification API.

    Args:
        stmt_uri: URI for the reified statement
        subject_uri: Subject of the extracted triple (currently unused)
        predicate_uri: Predicate of the extracted triple (currently unused)
        object_term: Object of the extracted triple (Term, currently unused)
        chunk_uri: URI of source chunk
        component_name: Name of extractor component
        component_version: Version of the component
        llm_model: LLM model used for extraction
        ontology_uri: Ontology URI used for extraction
        timestamp: ISO timestamp (defaults to now, UTC)

    Returns:
        List of Triple objects for the provenance (not the triple itself)
    """
    if timestamp is None:
        # datetime.utcnow() is deprecated (Python 3.12); build the identical
        # naive-UTC ISO-8601 "...Z" string from an aware datetime instead.
        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    act_uri = activity_uri()
    agt_uri = agent_uri(component_name)

    triples = [
        # Statement provenance
        _triple(stmt_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)),
        _triple(stmt_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),
        # Activity
        _triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
        _triple(act_uri, PROV_USED, _iri(chunk_uri)),
        _triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
        _triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
        _triple(act_uri, TG_COMPONENT_VERSION, _literal(component_version)),
        # Agent
        _triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)),
        _triple(agt_uri, RDFS_LABEL, _literal(component_name)),
    ]

    if llm_model:
        triples.append(_triple(act_uri, TG_LLM_MODEL, _literal(llm_model)))
    if ontology_uri:
        triples.append(_triple(act_uri, TG_ONTOLOGY, _iri(ontology_uri)))

    return triples

View file

@ -0,0 +1,61 @@
"""
URI generation for provenance entities.
URI patterns:
- Document: https://trustgraph.ai/doc/{doc_id}
- Page: https://trustgraph.ai/page/{doc_id}/p{page_number}
- Chunk: https://trustgraph.ai/chunk/{doc_id}/p{page}/c{chunk} (from page)
https://trustgraph.ai/chunk/{doc_id}/c{chunk} (from text doc)
- Activity: https://trustgraph.ai/activity/{uuid}
- Statement: https://trustgraph.ai/stmt/{uuid}
"""
import uuid
import urllib.parse
# Base URI prefix
TRUSTGRAPH_BASE = "https://trustgraph.ai"
def _encode_id(id_str: str) -> str:
"""URL-encode an ID component for safe inclusion in URIs."""
return urllib.parse.quote(str(id_str), safe='')
def document_uri(doc_id: str) -> str:
    """Return the canonical URI identifying a source document."""
    encoded = _encode_id(doc_id)
    return f"{TRUSTGRAPH_BASE}/doc/{encoded}"
def page_uri(doc_id: str, page_number: int) -> str:
    """Return the URI of one page extracted from a source document."""
    # Path shape: {base}/page/{doc}/p{n}
    return "/".join([TRUSTGRAPH_BASE, "page", _encode_id(doc_id), f"p{page_number}"])
def chunk_uri_from_page(doc_id: str, page_number: int, chunk_index: int) -> str:
    """Return the URI of a chunk carved out of a specific page."""
    prefix = f"{TRUSTGRAPH_BASE}/chunk/{_encode_id(doc_id)}"
    return f"{prefix}/p{page_number}/c{chunk_index}"
def chunk_uri_from_doc(doc_id: str, chunk_index: int) -> str:
    """Return the URI of a chunk taken directly from a (non-paged) text document."""
    doc_part = _encode_id(doc_id)
    return f"{TRUSTGRAPH_BASE}/chunk/{doc_part}/c{chunk_index}"
def activity_uri(activity_id: str | None = None) -> str:
    """
    Generate URI for a PROV-O activity.

    Args:
        activity_id: Optional explicit ID; a random UUID4 is generated
            when omitted.

    Returns:
        Activity URI string.
    """
    # Annotation fixed: the parameter defaults to None, so it is
    # `str | None`, not `str`.
    if activity_id is None:
        activity_id = str(uuid.uuid4())
    return f"{TRUSTGRAPH_BASE}/activity/{_encode_id(activity_id)}"
def statement_uri(stmt_id: str | None = None) -> str:
    """
    Generate URI for a reified statement.

    Args:
        stmt_id: Optional explicit ID; a random UUID4 is generated
            when omitted.

    Returns:
        Statement URI string.
    """
    # Annotation fixed: the parameter defaults to None, so it is
    # `str | None`, not `str`.
    if stmt_id is None:
        stmt_id = str(uuid.uuid4())
    return f"{TRUSTGRAPH_BASE}/stmt/{_encode_id(stmt_id)}"
def agent_uri(component_name: str) -> str:
    """Return the URI identifying a TrustGraph component as a PROV agent."""
    name = _encode_id(component_name)
    return f"{TRUSTGRAPH_BASE}/agent/{name}"

View file

@ -0,0 +1,101 @@
"""
Vocabulary bootstrap for provenance.
The knowledge graph is ontology-neutral and initializes empty. When writing
PROV-O provenance data to a collection for the first time, the vocabulary
must be bootstrapped with RDF labels for all classes and predicates.
"""
from typing import List
from .. schema import Triple, Term, IRI, LITERAL
from . namespaces import (
RDFS_LABEL,
PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
)
def _label_triple(uri: str, label: str) -> Triple:
    """Create an rdfs:label triple giving *uri* the human-readable *label*."""
    return Triple(
        s=Term(type=IRI, iri=uri),
        p=Term(type=IRI, iri=RDFS_LABEL),
        o=Term(type=LITERAL, value=label),
    )
# The four label lists below are concatenated by get_vocabulary_triples()
# and emitted once per collection; re-emitting them is idempotent.

# PROV-O class labels
PROV_CLASS_LABELS = [
    _label_triple(PROV_ENTITY, "Entity"),
    _label_triple(PROV_ACTIVITY, "Activity"),
    _label_triple(PROV_AGENT, "Agent"),
]

# PROV-O predicate labels
PROV_PREDICATE_LABELS = [
    _label_triple(PROV_WAS_DERIVED_FROM, "was derived from"),
    _label_triple(PROV_WAS_GENERATED_BY, "was generated by"),
    _label_triple(PROV_USED, "used"),
    _label_triple(PROV_WAS_ASSOCIATED_WITH, "was associated with"),
    _label_triple(PROV_STARTED_AT_TIME, "started at"),
]

# Dublin Core predicate labels
DC_PREDICATE_LABELS = [
    _label_triple(DC_TITLE, "title"),
    _label_triple(DC_SOURCE, "source"),
    _label_triple(DC_DATE, "date"),
    _label_triple(DC_CREATOR, "creator"),
]

# TrustGraph predicate labels
TG_PREDICATE_LABELS = [
    _label_triple(TG_REIFIES, "reifies"),
    _label_triple(TG_PAGE_COUNT, "page count"),
    _label_triple(TG_MIME_TYPE, "MIME type"),
    _label_triple(TG_PAGE_NUMBER, "page number"),
    _label_triple(TG_CHUNK_INDEX, "chunk index"),
    _label_triple(TG_CHAR_OFFSET, "character offset"),
    _label_triple(TG_CHAR_LENGTH, "character length"),
    _label_triple(TG_CHUNK_SIZE, "chunk size"),
    _label_triple(TG_CHUNK_OVERLAP, "chunk overlap"),
    _label_triple(TG_COMPONENT_VERSION, "component version"),
    _label_triple(TG_LLM_MODEL, "LLM model"),
    _label_triple(TG_ONTOLOGY, "ontology"),
    _label_triple(TG_EMBEDDING_MODEL, "embedding model"),
    _label_triple(TG_SOURCE_TEXT, "source text"),
    _label_triple(TG_SOURCE_CHAR_OFFSET, "source character offset"),
    _label_triple(TG_SOURCE_CHAR_LENGTH, "source character length"),
]
def get_vocabulary_triples() -> List[Triple]:
    """
    Return every vocabulary bootstrap triple as a single flat list.

    Covers rdfs:label definitions for all PROV-O classes, PROV-O
    predicates, Dublin Core predicates, and TrustGraph predicates used in
    extraction-time provenance.

    Emit this to the knowledge graph once per collection before writing
    any provenance data; re-emitting the same triples is harmless
    (idempotent).

    Returns:
        List of Triple objects defining vocabulary labels
    """
    # Unpack the four label groups, in the same order as before, into a
    # fresh list so callers may mutate the result safely.
    return [
        *PROV_CLASS_LABELS,
        *PROV_PREDICATE_LABELS,
        *DC_PREDICATE_LABELS,
        *TG_PREDICATE_LABELS,
    ]

View file

@ -34,5 +34,9 @@ class TextDocument:
class Chunk:
metadata: Metadata | None = None
chunk: bytes = b""
# For provenance: document_id of this chunk in librarian
# Post-chunker optimization: both document_id AND chunk content are included
# so downstream processors have the ID for provenance and content to work with
document_id: str = ""
############################################################################

View file

@ -12,6 +12,8 @@ from ..core.topic import topic
class EntityEmbeddings:
entity: Term | None = None
vectors: list[list[float]] = field(default_factory=list)
# Provenance: which chunk this embedding was derived from
chunk_id: str = ""
# This is a 'batching' mechanism for the above data
@dataclass

View file

@ -12,6 +12,8 @@ from ..core.topic import topic
class EntityContext:
entity: Term | None = None
context: str = ""
# Provenance: which chunk this entity context was derived from
chunk_id: str = ""
# This is a 'batching' mechanism for the above data
@dataclass

View file

@ -91,7 +91,12 @@ class DocumentMetadata:
tags: list[str] = field(default_factory=list)
# Child document support
parent_id: str = "" # Empty for top-level docs, set for children
document_type: str = "source" # "source" or "extracted"
# Document type vocabulary:
# "source" - original uploaded document
# "page" - page extracted from source (e.g., PDF page)
# "chunk" - text chunk derived from page or source
# "extracted" - legacy value, kept for backwards compatibility
document_type: str = "source"
@dataclass
class ProcessingMetadata: