Extract-time provenance (#661)

1. Shared Provenance Module - URI generators, namespace constants,
   triple builders, vocabulary bootstrap (ID/URI scheme sketched
   after this list)
2. Librarian - Emits document metadata to graph on processing
   initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
   provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
   forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
   - Link entities to chunks via SUBJECT_OF (not top-level document)
   - Removed duplicate metadata emission (now handled by librarian)
   - Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
   - EntityContext schema has chunk_id field
   - EntityEmbeddings schema has chunk_id field
   - Definitions extractor sets chunk_id when creating EntityContext
   - Graph embeddings processor passes chunk_id through to
     EntityEmbeddings
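
For orientation, a minimal sketch of the document/page/chunk ID and URI
scheme implied above. The "/p{n}" and "/c{m}" ID patterns come straight
from the diffs below; the namespace prefix and the helper bodies are
assumed placeholders, not the shared provenance module's actual code.

# Illustrative sketch only -- PROV_NS is an assumed placeholder namespace,
# and these helper bodies are guesses at what the provenance module does.
PROV_NS = "http://example.org/prov/"

def document_uri(doc_id: str) -> str:
    return f"{PROV_NS}{doc_id}"

def page_uri(doc_id: str, page_num: int) -> str:
    # Pages of a PDF use the "{doc}/p{n}" pattern
    return f"{PROV_NS}{doc_id}/p{page_num}"

def chunk_uri_from_page(doc_id: str, page_num: int, chunk_index: int) -> str:
    # Chunk of a PDF page: "{doc}/p{n}/c{m}"
    return f"{PROV_NS}{doc_id}/p{page_num}/c{chunk_index}"

def chunk_uri_from_doc(doc_id: str, chunk_index: int) -> str:
    # Chunk of a plain text document: "{doc}/c{m}"
    return f"{PROV_NS}{doc_id}/c{chunk_index}"

# Example: chunk 2 of page 3 of document "doc123"
assert chunk_uri_from_page("doc123", 3, 2).endswith("doc123/p3/c2")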

Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
    ↓           ↓          ↓              ↓
  librarian  librarian  librarian    (chunk_id reference)
  + graph    + graph    + graph

Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.
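
To make the traceability claim concrete, a hedged sketch of the chain a
consumer can walk: extracted entity → chunk → page → source document. The
edge names below are assumed shorthand for the SUBJECT_OF and PROV-O
predicates the pipeline actually emits, and the IDs are invented examples.

# Hedged sketch: walk provenance edges from an extracted entity back to
# its source document. Predicate names and IDs are illustrative only.
SUBJECT_OF = "subject-of"          # entity -> chunk (emitted by the extractors)
DERIVED_FROM = "was-derived-from"  # chunk -> page -> document (PROV-O style)

edges = {
    ("ent:Acme", SUBJECT_OF): "doc123/p3/c2",
    ("doc123/p3/c2", DERIVED_FROM): "doc123/p3",
    ("doc123/p3", DERIVED_FROM): "doc123",
}

def trace_to_source(entity):
    """Follow the subject-of edge, then derived-from edges, to the root."""
    path = [entity]
    node = edges.get((entity, SUBJECT_OF))
    while node is not None:
        path.append(node)
        node = edges.get((node, DERIVED_FROM))
    return path

print(trace_to_source("ent:Acme"))
# ['ent:Acme', 'doc123/p3/c2', 'doc123/p3', 'doc123']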

Also updates tests.
cybermaggedon committed 2026-03-05 18:36:10 +00:00 (via GitHub)
commit cd5580be59, parent d8f0a576af
20 changed files with 1601 additions and 59 deletions


@@ -8,9 +8,18 @@ import logging

 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from prometheus_client import Histogram

-from ... schema import TextDocument, Chunk
+from ... schema import TextDocument, Chunk, Metadata, Triples
 from ... base import ChunkingService, ConsumerSpec, ProducerSpec
+from ... provenance import (
+    page_uri, chunk_uri_from_page, chunk_uri_from_doc,
+    derived_entity_triples, document_uri,
+)
+
+# Component identification for provenance
+COMPONENT_NAME = "chunker"
+COMPONENT_VERSION = "1.0.0"

 # Module logger
 logger = logging.getLogger(__name__)
@@ -63,6 +72,13 @@ class Processor(ChunkingService):
             )
         )

+        self.register_specification(
+            ProducerSpec(
+                name = "triples",
+                schema = Triples,
+            )
+        )
+
         logger.info("Recursive chunker initialized")

     async def on_message(self, msg, consumer, flow):
@@ -96,21 +112,99 @@ class Processor(ChunkingService):

         texts = text_splitter.create_documents([text])

+        # Get parent document ID for provenance linking
+        parent_doc_id = v.document_id or v.metadata.id
+
+        # Determine if parent is a page (from PDF) or source document (text)
+        # Check if parent_doc_id contains "/p" which indicates a page
+        is_from_page = "/p" in parent_doc_id
+
+        # Extract the root document ID for chunk URI generation
+        if is_from_page:
+            # Parent is a page like "doc123/p3", extract page number
+            parts = parent_doc_id.rsplit("/p", 1)
+            root_doc_id = parts[0]
+            page_num = int(parts[1]) if len(parts) > 1 else 1
+        else:
+            root_doc_id = parent_doc_id
+            page_num = None
+
+        # Track character offset for provenance
+        char_offset = 0
+
         for ix, chunk in enumerate(texts):

+            chunk_index = ix + 1  # 1-indexed
+
             logger.debug(f"Created chunk of size {len(chunk.page_content)}")

+            # Generate chunk document ID
+            if is_from_page:
+                chunk_doc_id = f"{root_doc_id}/p{page_num}/c{chunk_index}"
+                chunk_uri = chunk_uri_from_page(root_doc_id, page_num, chunk_index)
+                parent_uri = page_uri(root_doc_id, page_num)
+            else:
+                chunk_doc_id = f"{root_doc_id}/c{chunk_index}"
+                chunk_uri = chunk_uri_from_doc(root_doc_id, chunk_index)
+                parent_uri = document_uri(root_doc_id)
+
+            chunk_content = chunk.page_content.encode("utf-8")
+            chunk_length = len(chunk.page_content)
+
+            # Save chunk to librarian as child document
+            await self.save_child_document(
+                doc_id=chunk_doc_id,
+                parent_id=parent_doc_id,
+                user=v.metadata.user,
+                content=chunk_content,
+                document_type="chunk",
+                title=f"Chunk {chunk_index}",
+            )
+
+            # Emit provenance triples
+            prov_triples = derived_entity_triples(
+                entity_uri=chunk_uri,
+                parent_uri=parent_uri,
+                component_name=COMPONENT_NAME,
+                component_version=COMPONENT_VERSION,
+                label=f"Chunk {chunk_index}",
+                chunk_index=chunk_index,
+                char_offset=char_offset,
+                char_length=chunk_length,
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+            )
+
+            await flow("triples").send(Triples(
+                metadata=Metadata(
+                    id=chunk_uri,
+                    metadata=[],
+                    user=v.metadata.user,
+                    collection=v.metadata.collection,
+                ),
+                triples=prov_triples,
+            ))
+
+            # Forward chunk ID + content (post-chunker optimization)
             r = Chunk(
-                metadata=v.metadata,
-                chunk=chunk.page_content.encode("utf-8"),
+                metadata=Metadata(
+                    id=chunk_uri,
+                    metadata=[],
+                    user=v.metadata.user,
+                    collection=v.metadata.collection,
+                ),
+                chunk=chunk_content,
+                document_id=chunk_doc_id,
             )

             __class__.chunk_metric.labels(
                 id=consumer.id, flow=consumer.flow
-            ).observe(len(chunk.page_content))
+            ).observe(chunk_length)

             await flow("output").send(r)

+            # Update character offset (approximate, doesn't account for overlap)
+            char_offset += chunk_length - chunk_overlap
+
         logger.debug("Document chunking complete")

     @staticmethod


@@ -16,11 +16,20 @@ import uuid

 from langchain_community.document_loaders import PyPDFLoader

 from ... schema import Document, TextDocument, Metadata
-from ... schema import LibrarianRequest, LibrarianResponse
+from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
 from ... schema import librarian_request_queue, librarian_response_queue
+from ... schema import Triples
 from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
 from ... base import Consumer, Producer, ConsumerMetrics, ProducerMetrics
+from ... provenance import (
+    document_uri, page_uri, derived_entity_triples,
+)
+
+# Component identification for provenance
+COMPONENT_NAME = "pdf-decoder"
+COMPONENT_VERSION = "1.0.0"

 # Module logger
 logger = logging.getLogger(__name__)
@@ -57,6 +66,13 @@ class Processor(FlowProcessor):
             )
         )

+        self.register_specification(
+            ProducerSpec(
+                name = "triples",
+                schema = Triples,
+            )
+        )
+
         # Librarian client for fetching document content
         librarian_request_q = params.get(
             "librarian_request_queue", default_librarian_request_queue
@@ -148,6 +164,66 @@ class Processor(FlowProcessor):
             self.pending_requests.pop(request_id, None)
             raise RuntimeError(f"Timeout fetching document {document_id}")

+    async def save_child_document(self, doc_id, parent_id, user, content,
+                                  document_type="page", title=None, timeout=120):
+        """
+        Save a child document to the librarian.
+
+        Args:
+            doc_id: ID for the new child document
+            parent_id: ID of the parent document
+            user: User ID
+            content: Document content (bytes)
+            document_type: Type of document ("page", "chunk", etc.)
+            title: Optional title
+            timeout: Request timeout in seconds
+
+        Returns:
+            The document ID on success
+        """
+
+        import base64
+
+        request_id = str(uuid.uuid4())
+
+        doc_metadata = DocumentMetadata(
+            id=doc_id,
+            user=user,
+            kind="text/plain",
+            title=title or doc_id,
+            parent_id=parent_id,
+            document_type=document_type,
+        )
+
+        request = LibrarianRequest(
+            operation="add-child-document",
+            document_metadata=doc_metadata,
+            content=base64.b64encode(content).decode("utf-8"),
+        )
+
+        # Create future for response
+        future = asyncio.get_event_loop().create_future()
+        self.pending_requests[request_id] = future
+
+        try:
+            # Send request
+            await self.librarian_request_producer.send(
+                request, properties={"id": request_id}
+            )
+
+            # Wait for response
+            response = await asyncio.wait_for(future, timeout=timeout)
+
+            if response.error:
+                raise RuntimeError(
+                    f"Librarian error saving child document: {response.error.type}: {response.error.message}"
+                )
+
+            return doc_id
+
+        except asyncio.TimeoutError:
+            self.pending_requests.pop(request_id, None)
+            raise RuntimeError(f"Timeout saving child document {doc_id}")
+
     async def on_message(self, msg, consumer, flow):

         logger.debug("PDF message received")
@@ -187,13 +263,62 @@ class Processor(FlowProcessor):

             loader = PyPDFLoader(temp_path)
             pages = loader.load()

+            # Get the source document ID
+            source_doc_id = v.document_id or v.metadata.id
+
             for ix, page in enumerate(pages):

-                logger.debug(f"Processing page {ix}")
+                page_num = ix + 1  # 1-indexed page numbers
+
+                logger.debug(f"Processing page {page_num}")
+
+                # Generate page document ID
+                page_doc_id = f"{source_doc_id}/p{page_num}"
+                page_content = page.page_content.encode("utf-8")
+
+                # Save page as child document in librarian
+                await self.save_child_document(
+                    doc_id=page_doc_id,
+                    parent_id=source_doc_id,
+                    user=v.metadata.user,
+                    content=page_content,
+                    document_type="page",
+                    title=f"Page {page_num}",
+                )
+
+                # Emit provenance triples
+                doc_uri = document_uri(source_doc_id)
+                pg_uri = page_uri(source_doc_id, page_num)
+
+                prov_triples = derived_entity_triples(
+                    entity_uri=pg_uri,
+                    parent_uri=doc_uri,
+                    component_name=COMPONENT_NAME,
+                    component_version=COMPONENT_VERSION,
+                    label=f"Page {page_num}",
+                    page_number=page_num,
+                )
+
+                await flow("triples").send(Triples(
+                    metadata=Metadata(
+                        id=pg_uri,
+                        metadata=[],
+                        user=v.metadata.user,
+                        collection=v.metadata.collection,
+                    ),
+                    triples=prov_triples,
+                ))
+
+                # Forward page document ID to chunker
+                # Chunker will fetch content from librarian
                 r = TextDocument(
-                    metadata=v.metadata,
-                    text=page.page_content.encode("utf-8"),
+                    metadata=Metadata(
+                        id=pg_uri,
+                        metadata=[],
+                        user=v.metadata.user,
+                        collection=v.metadata.collection,
+                    ),
+                    document_id=page_doc_id,
+                    text=b"",  # Empty, chunker will fetch from librarian
                 )

                 await flow("output").send(r)


@@ -71,7 +71,8 @@ class Processor(FlowProcessor):

             entities.append(
                 EntityEmbeddings(
                     entity=entity.entity,
-                    vectors=vectors
+                    vectors=vectors,
+                    chunk_id=entity.chunk_id,  # Provenance: source chunk
                 )
             )


@@ -128,10 +128,12 @@ class Processor(FlowProcessor):

         triples = []
         entities = []

-        # FIXME: Putting metadata into triples store is duplicated in
-        # relationships extractor too
-        for t in v.metadata.metadata:
-            triples.append(t)
+        # Get chunk document ID for provenance linking
+        chunk_doc_id = v.document_id if v.document_id else v.metadata.id
+        chunk_uri = v.metadata.id  # The URI form for the chunk
+
+        # Note: Document metadata is now emitted once by librarian at processing
+        # initiation, so we don't need to duplicate it here.

         for defn in defs:
@@ -159,22 +161,27 @@ class Processor(FlowProcessor):
                 s=s_value, p=DEFINITION_VALUE, o=o_value
             ))

+            # Link entity to chunk (not top-level document)
             triples.append(Triple(
                 s=s_value,
                 p=SUBJECT_OF_VALUE,
-                o=Term(type=IRI, iri=v.metadata.id)
+                o=Term(type=IRI, iri=chunk_uri)
             ))

             # Output entity name as context for direct name matching
+            # Include chunk_id for embedding provenance
             entities.append(EntityContext(
                 entity=s_value,
                 context=s,
+                chunk_id=chunk_doc_id,
             ))

             # Output definition as context for semantic matching
+            # Include chunk_id for embedding provenance
             entities.append(EntityContext(
                 entity=s_value,
                 context=defn["definition"],
+                chunk_id=chunk_doc_id,
             ))

         # Send triples in batches


@@ -109,10 +109,12 @@ class Processor(FlowProcessor):

         triples = []

-        # FIXME: Putting metadata into triples store is duplicated in
-        # relationships extractor too
-        for t in v.metadata.metadata:
-            triples.append(t)
+        # Get chunk document ID for provenance linking
+        chunk_doc_id = v.document_id if v.document_id else v.metadata.id
+        chunk_uri = v.metadata.id  # The URI form for the chunk
+
+        # Note: Document metadata is now emitted once by librarian at processing
+        # initiation, so we don't need to duplicate it here.

         for rel in rels:
@@ -168,19 +170,19 @@ class Processor(FlowProcessor):
                 o=Term(type=LITERAL, value=str(o))
             ))

-            # 'Subject of' for s
+            # Link entity to chunk (not top-level document)
             triples.append(Triple(
                 s=s_value,
                 p=SUBJECT_OF_VALUE,
-                o=Term(type=IRI, iri=v.metadata.id)
+                o=Term(type=IRI, iri=chunk_uri)
             ))

             if rel["object-entity"]:

-                # 'Subject of' for o
+                # Link object entity to chunk
                 triples.append(Triple(
                     s=o_value,
                     p=SUBJECT_OF_VALUE,
-                    o=Term(type=IRI, iri=v.metadata.id)
+                    o=Term(type=IRI, iri=chunk_uri)
                 ))

         # Send triples in batches


@@ -609,8 +609,10 @@ class Librarian:
         ):
             raise RequestError("Document already exists")

-        # Ensure document_type is set to "extracted"
-        request.document_metadata.document_type = "extracted"
+        # Set document_type if not specified by caller
+        # Valid types: "page", "chunk", or "extracted" (legacy)
+        if not request.document_metadata.document_type or request.document_metadata.document_type == "source":
+            request.document_metadata.document_type = "extracted"

         # Create object ID for blob
         object_id = uuid.uuid4()


@@ -23,9 +23,14 @@ from .. schema import config_request_queue, config_response_queue

 from .. schema import Document, Metadata
 from .. schema import TextDocument, Metadata
 from .. schema import Triples
 from .. exceptions import RequestError
+from .. provenance import (
+    document_uri, document_triples, get_vocabulary_triples,
+)

 from . librarian import Librarian
 from . collection_manager import CollectionManager
@@ -281,6 +286,67 @@ class Processor(AsyncProcessor):

     # Threshold for sending document_id instead of inline content (2MB)
     STREAMING_THRESHOLD = 2 * 1024 * 1024

+    async def emit_document_provenance(self, document, processing, triples_queue):
+        """
+        Emit document provenance metadata to the knowledge graph.
+
+        This emits:
+        1. Vocabulary bootstrap triples (idempotent, safe to re-emit)
+        2. Document metadata as PROV-O triples
+        """
+
+        logger.debug(f"Emitting document provenance for {document.id}")
+
+        # Build document URI and provenance triples
+        doc_uri = document_uri(document.id)
+
+        # Get page count for PDFs (if available from document metadata)
+        page_count = None
+        if document.kind == "application/pdf":
+            # Page count might be in document metadata triples
+            # For now, we don't have it at this point - it gets determined during extraction
+            pass
+
+        # Build document metadata triples
+        prov_triples = document_triples(
+            doc_uri=doc_uri,
+            title=document.title if document.title else None,
+            mime_type=document.kind,
+        )
+
+        # Include any existing metadata triples from the document
+        if document.metadata:
+            prov_triples.extend(document.metadata)
+
+        # Get vocabulary bootstrap triples (idempotent)
+        vocab_triples = get_vocabulary_triples()
+
+        # Combine all triples
+        all_triples = vocab_triples + prov_triples
+
+        # Create publisher and emit
+        triples_pub = Publisher(
+            self.pubsub, triples_queue, schema=Triples
+        )
+
+        try:
+            await triples_pub.start()
+
+            triples_msg = Triples(
+                metadata=Metadata(
+                    id=doc_uri,
+                    metadata=[],
+                    user=processing.user,
+                    collection=processing.collection,
+                ),
+                triples=all_triples,
+            )
+
+            await triples_pub.send(None, triples_msg)
+
+            logger.debug(f"Emitted {len(all_triples)} provenance triples for {document.id}")
+
+        finally:
+            await triples_pub.stop()
+
     async def load_document(self, document, processing, content):

         logger.debug("Ready for document processing...")
@@ -301,6 +367,12 @@ class Processor(AsyncProcessor):

             q = flow["interfaces"][kind]

+            # Emit document provenance to knowledge graph
+            if "triples-store" in flow["interfaces"]:
+                await self.emit_document_provenance(
+                    document, processing, flow["interfaces"]["triples-store"]
+                )
+
             if kind == "text-load":

                 # For large text documents, send document_id for streaming retrieval
                 if len(content) >= self.STREAMING_THRESHOLD: