Extract-time provenance (#661)

1. Shared Provenance Module - URI generators, namespace constants, triple builders, vocabulary bootstrap
2. Librarian - Emits document metadata to graph on processing initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges, forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
   - Link entities to chunks via SUBJECT_OF (not top-level document)
   - Remove duplicate metadata emission (now handled by librarian)
   - Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
   - EntityContext schema has chunk_id field
   - EntityEmbeddings schema has chunk_id field
   - Definitions extractor sets chunk_id when creating EntityContext
   - Graph embeddings processor passes chunk_id through to EntityEmbeddings

Provenance Flow:

    Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
        ↓          ↓           ↓               ↓
    librarian  librarian   librarian   (chunk_id reference)
    + graph    + graph     + graph

Each artifact is stored in librarian with parent-child linking, and PROV-O edges are emitted to the knowledge graph for full traceability from any extracted fact back to its source document.

Also updates the tests.
2026-06-17 02:45:14 +02:00 · 2026-03-05 18:36:10 +00:00 · 2026-03-05 18:36:10 +00:00 · cd5580be59
commit cd5580be59
parent d8f0a576af
20 changed files with 1601 additions and 59 deletions
--- a/tests/unit/test_chunking/test_recursive_chunker.py
+++ b/tests/unit/test_chunking/test_recursive_chunker.py
@ -176,6 +176,9 @@ class TestRecursiveChunkerSimple(IsolatedAsyncioTestCase):

        processor = Processor(**config)

+        # Mock save_child_document to avoid waiting for librarian response
+        processor.save_child_document = AsyncMock(return_value="mock-doc-id")
+
        # Mock message with TextDocument
        mock_message = MagicMock()
        mock_text_doc = MagicMock()
@ -192,11 +195,13 @@ class TestRecursiveChunkerSimple(IsolatedAsyncioTestCase):
        # Mock consumer and flow with parameter overrides
        mock_consumer = MagicMock()
        mock_producer = AsyncMock()
+        mock_triples_producer = AsyncMock()
        mock_flow = MagicMock()
        mock_flow.side_effect = lambda param: {
            "chunk-size": 1500,
            "chunk-overlap": 150,
-            "output": mock_producer
+            "output": mock_producer,
+            "triples": mock_triples_producer,
        }.get(param)

        # Act
--- a/tests/unit/test_decoding/test_pdf_decoder.py
+++ b/tests/unit/test_decoding/test_pdf_decoder.py
@ -69,9 +69,13 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
        mock_msg = MagicMock()
        mock_msg.value.return_value = mock_document

-        # Mock flow
+        # Mock flow - separate mocks for output and triples
        mock_output_flow = AsyncMock()
-        mock_flow = MagicMock(return_value=mock_output_flow)
+        mock_triples_flow = AsyncMock()
+        mock_flow = MagicMock(side_effect=lambda name: {
+            "output": mock_output_flow,
+            "triples": mock_triples_flow,
+        }.get(name))

        config = {
            'id': 'test-pdf-decoder',
@ -80,10 +84,15 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):

        processor = Processor(**config)

+        # Mock save_child_document to avoid waiting for librarian response
+        processor.save_child_document = AsyncMock(return_value="mock-doc-id")
+
        await processor.on_message(mock_msg, None, mock_flow)

        # Verify output was sent for each page
        assert mock_output_flow.send.call_count == 2
+        # Verify triples were sent for each page (provenance)
+        assert mock_triples_flow.send.call_count == 2

    @patch('trustgraph.base.chunking_service.Consumer')
    @patch('trustgraph.base.chunking_service.Producer')
@ -140,8 +149,13 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
        mock_msg = MagicMock()
        mock_msg.value.return_value = mock_document

+        # Mock flow - separate mocks for output and triples
        mock_output_flow = AsyncMock()
-        mock_flow = MagicMock(return_value=mock_output_flow)
+        mock_triples_flow = AsyncMock()
+        mock_flow = MagicMock(side_effect=lambda name: {
+            "output": mock_output_flow,
+            "triples": mock_triples_flow,
+        }.get(name))

        config = {
            'id': 'test-pdf-decoder',
@ -150,11 +164,16 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):

        processor = Processor(**config)

+        # Mock save_child_document to avoid waiting for librarian response
+        processor.save_child_document = AsyncMock(return_value="mock-doc-id")
+
        await processor.on_message(mock_msg, None, mock_flow)

        mock_output_flow.send.assert_called_once()
        call_args = mock_output_flow.send.call_args[0][0]
-        assert call_args.text == "Page with unicode: 你好世界 🌍".encode('utf-8')
+        # PDF decoder now forwards document_id, chunker fetches content from librarian
+        assert call_args.document_id == "test-doc/p1"
+        assert call_args.text == b""  # Content stored in librarian, not inline

    @patch('trustgraph.base.flow_processor.FlowProcessor.add_args')
    def test_add_args(self, mock_parent_add_args):