mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Extract-time provenance (#661)
1. Shared Provenance Module - URI generators, namespace constants,
triple builders, vocabulary bootstrap
2. Librarian - Emits document metadata to graph on processing
initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
- Link entities to chunks via SUBJECT_OF (not top-level document)
- Removed duplicate metadata emission (now handled by librarian)
- Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
- EntityContext schema has chunk_id field
- EntityEmbeddings schema has chunk_id field
- Definitions extractor sets chunk_id when creating EntityContext
- Graph embeddings processor passes chunk_id through to
EntityEmbeddings
Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
↓ ↓ ↓ ↓
librarian librarian librarian (chunk_id reference)
+ graph + graph + graph
Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.
Also, updating tests
This commit is contained in:
parent
d8f0a576af
commit
cd5580be59
20 changed files with 1601 additions and 59 deletions
|
|
@ -176,6 +176,9 @@ class TestRecursiveChunkerSimple(IsolatedAsyncioTestCase):
|
|||
|
||||
processor = Processor(**config)
|
||||
|
||||
# Mock save_child_document to avoid waiting for librarian response
|
||||
processor.save_child_document = AsyncMock(return_value="mock-doc-id")
|
||||
|
||||
# Mock message with TextDocument
|
||||
mock_message = MagicMock()
|
||||
mock_text_doc = MagicMock()
|
||||
|
|
@ -192,11 +195,13 @@ class TestRecursiveChunkerSimple(IsolatedAsyncioTestCase):
|
|||
# Mock consumer and flow with parameter overrides
|
||||
mock_consumer = MagicMock()
|
||||
mock_producer = AsyncMock()
|
||||
mock_triples_producer = AsyncMock()
|
||||
mock_flow = MagicMock()
|
||||
mock_flow.side_effect = lambda param: {
|
||||
"chunk-size": 1500,
|
||||
"chunk-overlap": 150,
|
||||
"output": mock_producer
|
||||
"output": mock_producer,
|
||||
"triples": mock_triples_producer,
|
||||
}.get(param)
|
||||
|
||||
# Act
|
||||
|
|
|
|||
|
|
@ -69,9 +69,13 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
|
|||
mock_msg = MagicMock()
|
||||
mock_msg.value.return_value = mock_document
|
||||
|
||||
# Mock flow
|
||||
# Mock flow - separate mocks for output and triples
|
||||
mock_output_flow = AsyncMock()
|
||||
mock_flow = MagicMock(return_value=mock_output_flow)
|
||||
mock_triples_flow = AsyncMock()
|
||||
mock_flow = MagicMock(side_effect=lambda name: {
|
||||
"output": mock_output_flow,
|
||||
"triples": mock_triples_flow,
|
||||
}.get(name))
|
||||
|
||||
config = {
|
||||
'id': 'test-pdf-decoder',
|
||||
|
|
@ -80,10 +84,15 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
|
|||
|
||||
processor = Processor(**config)
|
||||
|
||||
# Mock save_child_document to avoid waiting for librarian response
|
||||
processor.save_child_document = AsyncMock(return_value="mock-doc-id")
|
||||
|
||||
await processor.on_message(mock_msg, None, mock_flow)
|
||||
|
||||
# Verify output was sent for each page
|
||||
assert mock_output_flow.send.call_count == 2
|
||||
# Verify triples were sent for each page (provenance)
|
||||
assert mock_triples_flow.send.call_count == 2
|
||||
|
||||
@patch('trustgraph.base.chunking_service.Consumer')
|
||||
@patch('trustgraph.base.chunking_service.Producer')
|
||||
|
|
@ -140,8 +149,13 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
|
|||
mock_msg = MagicMock()
|
||||
mock_msg.value.return_value = mock_document
|
||||
|
||||
# Mock flow - separate mocks for output and triples
|
||||
mock_output_flow = AsyncMock()
|
||||
mock_flow = MagicMock(return_value=mock_output_flow)
|
||||
mock_triples_flow = AsyncMock()
|
||||
mock_flow = MagicMock(side_effect=lambda name: {
|
||||
"output": mock_output_flow,
|
||||
"triples": mock_triples_flow,
|
||||
}.get(name))
|
||||
|
||||
config = {
|
||||
'id': 'test-pdf-decoder',
|
||||
|
|
@ -150,11 +164,16 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
|
|||
|
||||
processor = Processor(**config)
|
||||
|
||||
# Mock save_child_document to avoid waiting for librarian response
|
||||
processor.save_child_document = AsyncMock(return_value="mock-doc-id")
|
||||
|
||||
await processor.on_message(mock_msg, None, mock_flow)
|
||||
|
||||
mock_output_flow.send.assert_called_once()
|
||||
call_args = mock_output_flow.send.call_args[0][0]
|
||||
assert call_args.text == "Page with unicode: 你好世界 🌍".encode('utf-8')
|
||||
# PDF decoder now forwards document_id, chunker fetches content from librarian
|
||||
assert call_args.document_id == "test-doc/p1"
|
||||
assert call_args.text == b"" # Content stored in librarian, not inline
|
||||
|
||||
@patch('trustgraph.base.flow_processor.FlowProcessor.add_args')
|
||||
def test_add_args(self, mock_parent_add_args):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue