mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Use UUID-based URNs for page and chunk IDs (#703)
Page and chunk document IDs were deterministic ({doc_id}/p{num},
{doc_id}/p{num}/c{num}), causing "Document already exists" errors
when reprocessing documents through different flows. Content may
differ between runs due to different parameters or extractors, so
deterministic IDs are incorrect.
Pages now use urn:page:{uuid}, chunks use
urn:chunk:{uuid}. Parent- child relationships are tracked via
librarian metadata and provenance triples.
Also brings Mistral OCR and Tesseract OCR decoders up to parity
with the PDF decoder: librarian fetch/save support, per-page
output with unique IDs, and provenance triple emission. Fixes
Mistral OCR bug where only the first 5 pages were processed.
This commit is contained in:
parent
1a7b654bd3
commit
96fd1eab15
10 changed files with 694 additions and 286 deletions
|
|
@ -10,159 +10,137 @@ from unittest import IsolatedAsyncioTestCase
|
|||
from io import BytesIO
|
||||
|
||||
from trustgraph.decoding.mistral_ocr.processor import Processor
|
||||
from trustgraph.schema import Document, TextDocument, Metadata
|
||||
from trustgraph.schema import Document, TextDocument, Metadata, Triples
|
||||
|
||||
|
||||
class MockAsyncProcessor:
|
||||
def __init__(self, **params):
|
||||
self.config_handlers = []
|
||||
self.id = params.get('id', 'test-service')
|
||||
self.specifications = []
|
||||
self.pubsub = MagicMock()
|
||||
self.taskgroup = params.get('taskgroup', MagicMock())
|
||||
|
||||
|
||||
class TestMistralOcrProcessor(IsolatedAsyncioTestCase):
|
||||
"""Test Mistral OCR processor functionality"""
|
||||
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Consumer')
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Producer')
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Mistral')
|
||||
@patch('trustgraph.base.flow_processor.FlowProcessor.__init__')
|
||||
async def test_processor_initialization_with_api_key(self, mock_flow_init, mock_mistral_class):
|
||||
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
|
||||
async def test_processor_initialization_with_api_key(
|
||||
self, mock_mistral_class, mock_producer, mock_consumer
|
||||
):
|
||||
"""Test Mistral OCR processor initialization with API key"""
|
||||
# Arrange
|
||||
mock_flow_init.return_value = None
|
||||
mock_mistral = MagicMock()
|
||||
mock_mistral_class.return_value = mock_mistral
|
||||
|
||||
mock_mistral_class.return_value = MagicMock()
|
||||
|
||||
config = {
|
||||
'id': 'test-mistral-ocr',
|
||||
'api_key': 'test-api-key',
|
||||
'taskgroup': AsyncMock()
|
||||
}
|
||||
|
||||
# Act
|
||||
with patch.object(Processor, 'register_specification') as mock_register:
|
||||
processor = Processor(**config)
|
||||
processor = Processor(**config)
|
||||
|
||||
# Assert
|
||||
mock_flow_init.assert_called_once()
|
||||
mock_mistral_class.assert_called_once_with(api_key='test-api-key')
|
||||
|
||||
# Verify register_specification was called twice (consumer and producer)
|
||||
assert mock_register.call_count == 2
|
||||
|
||||
# Check consumer spec
|
||||
consumer_call = mock_register.call_args_list[0]
|
||||
consumer_spec = consumer_call[0][0]
|
||||
assert consumer_spec.name == "input"
|
||||
assert consumer_spec.schema == Document
|
||||
assert consumer_spec.handler == processor.on_message
|
||||
|
||||
# Check producer spec
|
||||
producer_call = mock_register.call_args_list[1]
|
||||
producer_spec = producer_call[0][0]
|
||||
assert producer_spec.name == "output"
|
||||
assert producer_spec.schema == TextDocument
|
||||
|
||||
@patch('trustgraph.base.flow_processor.FlowProcessor.__init__')
|
||||
async def test_processor_initialization_without_api_key(self, mock_flow_init):
|
||||
# Check specs registered: input consumer, output producer, triples producer
|
||||
consumer_specs = [s for s in processor.specifications if hasattr(s, 'handler')]
|
||||
assert len(consumer_specs) >= 1
|
||||
assert consumer_specs[0].name == "input"
|
||||
assert consumer_specs[0].schema == Document
|
||||
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Consumer')
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Producer')
|
||||
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
|
||||
async def test_processor_initialization_without_api_key(
|
||||
self, mock_producer, mock_consumer
|
||||
):
|
||||
"""Test Mistral OCR processor initialization without API key raises error"""
|
||||
# Arrange
|
||||
mock_flow_init.return_value = None
|
||||
|
||||
config = {
|
||||
'id': 'test-mistral-ocr',
|
||||
'taskgroup': AsyncMock()
|
||||
}
|
||||
|
||||
# Act & Assert
|
||||
with patch.object(Processor, 'register_specification'):
|
||||
with pytest.raises(RuntimeError, match="Mistral API key not specified"):
|
||||
processor = Processor(**config)
|
||||
with pytest.raises(RuntimeError, match="Mistral API key not specified"):
|
||||
Processor(**config)
|
||||
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.uuid.uuid4')
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Consumer')
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Producer')
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Mistral')
|
||||
@patch('trustgraph.base.flow_processor.FlowProcessor.__init__')
|
||||
async def test_ocr_single_chunk(self, mock_flow_init, mock_mistral_class, mock_uuid):
|
||||
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
|
||||
async def test_ocr_single_chunk(
|
||||
self, mock_mistral_class, mock_producer, mock_consumer
|
||||
):
|
||||
"""Test OCR processing with a single chunk (less than 5 pages)"""
|
||||
# Arrange
|
||||
mock_flow_init.return_value = None
|
||||
mock_uuid.return_value = "test-uuid-1234"
|
||||
|
||||
# Mock Mistral client
|
||||
mock_mistral = MagicMock()
|
||||
mock_mistral_class.return_value = mock_mistral
|
||||
|
||||
|
||||
# Mock file upload
|
||||
mock_uploaded_file = MagicMock(id="file-123")
|
||||
mock_mistral.files.upload.return_value = mock_uploaded_file
|
||||
|
||||
|
||||
# Mock signed URL
|
||||
mock_signed_url = MagicMock(url="https://example.com/signed-url")
|
||||
mock_mistral.files.get_signed_url.return_value = mock_signed_url
|
||||
|
||||
# Mock OCR response
|
||||
mock_page = MagicMock(
|
||||
|
||||
# Mock OCR response with 2 pages
|
||||
mock_page1 = MagicMock(
|
||||
markdown="# Page 1\nContent ",
|
||||
images=[MagicMock(id="img1", image_base64="data:image/png;base64,abc123")]
|
||||
)
|
||||
mock_ocr_response = MagicMock(pages=[mock_page])
|
||||
mock_page2 = MagicMock(
|
||||
markdown="# Page 2\nMore content",
|
||||
images=[]
|
||||
)
|
||||
mock_ocr_response = MagicMock(pages=[mock_page1, mock_page2])
|
||||
mock_mistral.ocr.process.return_value = mock_ocr_response
|
||||
|
||||
|
||||
# Mock PyPDF
|
||||
mock_pdf_reader = MagicMock()
|
||||
mock_pdf_reader.pages = [MagicMock(), MagicMock(), MagicMock()] # 3 pages
|
||||
|
||||
mock_pdf_reader.pages = [MagicMock(), MagicMock(), MagicMock()]
|
||||
|
||||
config = {
|
||||
'id': 'test-mistral-ocr',
|
||||
'api_key': 'test-api-key',
|
||||
'taskgroup': AsyncMock()
|
||||
}
|
||||
|
||||
with patch.object(Processor, 'register_specification'):
|
||||
with patch('trustgraph.decoding.mistral_ocr.processor.PdfReader', return_value=mock_pdf_reader):
|
||||
with patch('trustgraph.decoding.mistral_ocr.processor.PdfWriter') as mock_pdf_writer_class:
|
||||
mock_pdf_writer = MagicMock()
|
||||
mock_pdf_writer_class.return_value = mock_pdf_writer
|
||||
|
||||
processor = Processor(**config)
|
||||
|
||||
# Act
|
||||
result = processor.ocr(b"fake pdf content")
|
||||
with patch('trustgraph.decoding.mistral_ocr.processor.PdfReader', return_value=mock_pdf_reader):
|
||||
with patch('trustgraph.decoding.mistral_ocr.processor.PdfWriter') as mock_pdf_writer_class:
|
||||
mock_pdf_writer = MagicMock()
|
||||
mock_pdf_writer_class.return_value = mock_pdf_writer
|
||||
|
||||
# Assert
|
||||
assert result == "# Page 1\nContent "
|
||||
|
||||
# Verify PDF writer was used to create chunk
|
||||
processor = Processor(**config)
|
||||
result = processor.ocr(b"fake pdf content")
|
||||
|
||||
# Returns list of (markdown, page_num) tuples
|
||||
assert len(result) == 2
|
||||
assert result[0] == ("# Page 1\nContent ", 1)
|
||||
assert result[1] == ("# Page 2\nMore content", 2)
|
||||
|
||||
# Verify PDF writer was used
|
||||
assert mock_pdf_writer.add_page.call_count == 3
|
||||
mock_pdf_writer.write_stream.assert_called_once()
|
||||
|
||||
|
||||
# Verify Mistral API calls
|
||||
mock_mistral.files.upload.assert_called_once()
|
||||
upload_call = mock_mistral.files.upload.call_args[1]
|
||||
assert upload_call['file']['file_name'] == "test-uuid-1234"
|
||||
assert upload_call['purpose'] == 'ocr'
|
||||
|
||||
mock_mistral.files.get_signed_url.assert_called_once_with(
|
||||
file_id="file-123", expiry=1
|
||||
)
|
||||
|
||||
mock_mistral.ocr.process.assert_called_once_with(
|
||||
model="mistral-ocr-latest",
|
||||
include_image_base64=True,
|
||||
document={
|
||||
"type": "document_url",
|
||||
"document_url": "https://example.com/signed-url",
|
||||
}
|
||||
)
|
||||
mock_mistral.ocr.process.assert_called_once()
|
||||
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.uuid.uuid4')
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Consumer')
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Producer')
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Mistral')
|
||||
@patch('trustgraph.base.flow_processor.FlowProcessor.__init__')
|
||||
async def test_on_message_success(self, mock_flow_init, mock_mistral_class, mock_uuid):
|
||||
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
|
||||
async def test_on_message_success(
|
||||
self, mock_mistral_class, mock_producer, mock_consumer
|
||||
):
|
||||
"""Test successful message processing"""
|
||||
# Arrange
|
||||
mock_flow_init.return_value = None
|
||||
mock_uuid.return_value = "test-uuid-5678"
|
||||
|
||||
# Mock Mistral client with simple OCR response
|
||||
mock_mistral = MagicMock()
|
||||
mock_mistral_class.return_value = mock_mistral
|
||||
|
||||
# Mock the ocr method to return simple markdown
|
||||
ocr_result = "# Document Title\nThis is the OCR content"
|
||||
|
||||
mock_mistral_class.return_value = MagicMock()
|
||||
|
||||
# Mock message
|
||||
pdf_content = b"fake pdf content"
|
||||
pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')
|
||||
|
|
@ -170,126 +148,100 @@ class TestMistralOcrProcessor(IsolatedAsyncioTestCase):
|
|||
mock_document = Document(metadata=mock_metadata, data=pdf_base64)
|
||||
mock_msg = MagicMock()
|
||||
mock_msg.value.return_value = mock_document
|
||||
|
||||
# Mock flow - needs to be a callable that returns an object with send method
|
||||
|
||||
# Mock flow
|
||||
mock_output_flow = AsyncMock()
|
||||
mock_flow = MagicMock(return_value=mock_output_flow)
|
||||
|
||||
mock_triples_flow = AsyncMock()
|
||||
mock_flow = MagicMock(side_effect=lambda name: {
|
||||
"output": mock_output_flow,
|
||||
"triples": mock_triples_flow,
|
||||
}.get(name))
|
||||
|
||||
config = {
|
||||
'id': 'test-mistral-ocr',
|
||||
'api_key': 'test-api-key',
|
||||
'taskgroup': AsyncMock()
|
||||
}
|
||||
|
||||
with patch.object(Processor, 'register_specification'):
|
||||
processor = Processor(**config)
|
||||
|
||||
# Mock the ocr method
|
||||
with patch.object(processor, 'ocr', return_value=ocr_result):
|
||||
# Act
|
||||
await processor.on_message(mock_msg, None, mock_flow)
|
||||
processor = Processor(**config)
|
||||
|
||||
# Assert
|
||||
# Verify output was sent
|
||||
mock_output_flow.send.assert_called_once()
|
||||
|
||||
# Check output
|
||||
call_args = mock_output_flow.send.call_args[0][0]
|
||||
# Mock ocr to return per-page results
|
||||
ocr_result = [
|
||||
("# Page 1\nContent", 1),
|
||||
("# Page 2\nMore content", 2),
|
||||
]
|
||||
|
||||
# Mock save_child_document
|
||||
processor.save_child_document = AsyncMock(return_value="mock-doc-id")
|
||||
|
||||
with patch.object(processor, 'ocr', return_value=ocr_result):
|
||||
await processor.on_message(mock_msg, None, mock_flow)
|
||||
|
||||
# Verify output was sent for each page
|
||||
assert mock_output_flow.send.call_count == 2
|
||||
# Verify triples were sent for each page
|
||||
assert mock_triples_flow.send.call_count == 2
|
||||
|
||||
# Check output uses UUID-based page URNs
|
||||
call_args = mock_output_flow.send.call_args_list[0][0][0]
|
||||
assert isinstance(call_args, TextDocument)
|
||||
assert call_args.metadata == mock_metadata
|
||||
assert call_args.text == ocr_result.encode('utf-8')
|
||||
assert call_args.document_id.startswith("urn:page:")
|
||||
assert call_args.text == b"" # Content stored in librarian
|
||||
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Mistral')
|
||||
@patch('trustgraph.base.flow_processor.FlowProcessor.__init__')
|
||||
async def test_chunks_function(self, mock_flow_init, mock_mistral_class):
|
||||
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
|
||||
async def test_chunks_function(self, mock_mistral_class):
|
||||
"""Test the chunks utility function"""
|
||||
# Arrange
|
||||
from trustgraph.decoding.mistral_ocr.processor import chunks
|
||||
|
||||
|
||||
test_list = list(range(12))
|
||||
|
||||
# Act
|
||||
result = list(chunks(test_list, 5))
|
||||
|
||||
# Assert
|
||||
|
||||
assert len(result) == 3
|
||||
assert result[0] == [0, 1, 2, 3, 4]
|
||||
assert result[1] == [5, 6, 7, 8, 9]
|
||||
assert result[2] == [10, 11]
|
||||
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Mistral')
|
||||
@patch('trustgraph.base.flow_processor.FlowProcessor.__init__')
|
||||
async def test_replace_images_in_markdown(self, mock_flow_init, mock_mistral_class):
|
||||
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
|
||||
async def test_replace_images_in_markdown(self, mock_mistral_class):
|
||||
"""Test the replace_images_in_markdown function"""
|
||||
# Arrange
|
||||
from trustgraph.decoding.mistral_ocr.processor import replace_images_in_markdown
|
||||
|
||||
|
||||
markdown = "# Title\n\nSome text\n"
|
||||
images_dict = {
|
||||
"image1": "data:image/png;base64,abc123",
|
||||
"image2": "data:image/png;base64,def456"
|
||||
}
|
||||
|
||||
# Act
|
||||
result = replace_images_in_markdown(markdown, images_dict)
|
||||
|
||||
# Assert
|
||||
expected = "# Title\n\nSome text\n"
|
||||
assert result == expected
|
||||
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Mistral')
|
||||
@patch('trustgraph.base.flow_processor.FlowProcessor.__init__')
|
||||
async def test_get_combined_markdown(self, mock_flow_init, mock_mistral_class):
|
||||
"""Test the get_combined_markdown function"""
|
||||
# Arrange
|
||||
from trustgraph.decoding.mistral_ocr.processor import get_combined_markdown
|
||||
from mistralai.models import OCRResponse
|
||||
|
||||
# Mock OCR response with multiple pages
|
||||
mock_page1 = MagicMock(
|
||||
markdown="# Page 1\n",
|
||||
images=[MagicMock(id="img1", image_base64="base64_img1")]
|
||||
)
|
||||
mock_page2 = MagicMock(
|
||||
markdown="# Page 2\n",
|
||||
images=[MagicMock(id="img2", image_base64="base64_img2")]
|
||||
)
|
||||
mock_ocr_response = MagicMock(pages=[mock_page1, mock_page2])
|
||||
|
||||
# Act
|
||||
result = get_combined_markdown(mock_ocr_response)
|
||||
|
||||
# Assert
|
||||
expected = "# Page 1\n\n\n# Page 2\n"
|
||||
result = replace_images_in_markdown(markdown, images_dict)
|
||||
|
||||
expected = "# Title\n\nSome text\n"
|
||||
assert result == expected
|
||||
|
||||
@patch('trustgraph.base.flow_processor.FlowProcessor.add_args')
|
||||
def test_add_args(self, mock_parent_add_args):
|
||||
"""Test add_args adds API key argument"""
|
||||
# Arrange
|
||||
"""Test add_args adds expected arguments"""
|
||||
mock_parser = MagicMock()
|
||||
|
||||
# Act
|
||||
|
||||
Processor.add_args(mock_parser)
|
||||
|
||||
# Assert
|
||||
|
||||
mock_parent_add_args.assert_called_once_with(mock_parser)
|
||||
mock_parser.add_argument.assert_called_once_with(
|
||||
'-k', '--api-key',
|
||||
default=None, # default_api_key is None in test environment
|
||||
help='Mistral API Key'
|
||||
)
|
||||
assert mock_parser.add_argument.call_count == 3
|
||||
# Check the API key arg is among them
|
||||
call_args_list = [c[0] for c in mock_parser.add_argument.call_args_list]
|
||||
assert ('-k', '--api-key') in call_args_list
|
||||
|
||||
@patch('trustgraph.decoding.mistral_ocr.processor.Processor.launch')
|
||||
def test_run(self, mock_launch):
|
||||
"""Test run function"""
|
||||
# Act
|
||||
from trustgraph.decoding.mistral_ocr.processor import run
|
||||
run()
|
||||
|
||||
# Assert
|
||||
mock_launch.assert_called_once_with("pdf-decoder",
|
||||
"\nSimple decoder, accepts PDF documents on input, outputs pages from the\nPDF document as text as separate output objects.\n")
|
||||
|
||||
mock_launch.assert_called_once()
|
||||
args = mock_launch.call_args[0]
|
||||
assert args[0] == "pdf-decoder"
|
||||
assert "Mistral OCR decoder" in args[1]
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
|||
|
|
@ -171,8 +171,8 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
|
|||
|
||||
mock_output_flow.send.assert_called_once()
|
||||
call_args = mock_output_flow.send.call_args[0][0]
|
||||
# PDF decoder now forwards document_id, chunker fetches content from librarian
|
||||
assert call_args.document_id == "test-doc/p1"
|
||||
# PDF decoder now forwards document_id with UUID-based URN
|
||||
assert call_args.document_id.startswith("urn:page:")
|
||||
assert call_args.text == b"" # Content stored in librarian, not inline
|
||||
|
||||
@patch('trustgraph.base.flow_processor.FlowProcessor.add_args')
|
||||
|
|
|
|||
|
|
@ -10,8 +10,7 @@ from trustgraph.provenance.uris import (
|
|||
_encode_id,
|
||||
document_uri,
|
||||
page_uri,
|
||||
chunk_uri_from_page,
|
||||
chunk_uri_from_doc,
|
||||
chunk_uri,
|
||||
activity_uri,
|
||||
subgraph_uri,
|
||||
agent_uri,
|
||||
|
|
@ -60,31 +59,22 @@ class TestDocumentUris:
|
|||
assert document_uri(iri) == iri
|
||||
|
||||
def test_page_uri_format(self):
|
||||
result = page_uri("https://example.com/doc/123", 5)
|
||||
assert result == "https://example.com/doc/123/p5"
|
||||
result = page_uri()
|
||||
assert result.startswith("urn:page:")
|
||||
|
||||
def test_page_uri_page_zero(self):
|
||||
result = page_uri("https://example.com/doc/123", 0)
|
||||
assert result == "https://example.com/doc/123/p0"
|
||||
def test_page_uri_unique(self):
|
||||
r1 = page_uri()
|
||||
r2 = page_uri()
|
||||
assert r1 != r2
|
||||
|
||||
def test_chunk_uri_from_page_format(self):
|
||||
result = chunk_uri_from_page("https://example.com/doc/123", 2, 3)
|
||||
assert result == "https://example.com/doc/123/p2/c3"
|
||||
def test_chunk_uri_format(self):
|
||||
result = chunk_uri()
|
||||
assert result.startswith("urn:chunk:")
|
||||
|
||||
def test_chunk_uri_from_doc_format(self):
|
||||
result = chunk_uri_from_doc("https://example.com/doc/123", 7)
|
||||
assert result == "https://example.com/doc/123/c7"
|
||||
|
||||
def test_page_uri_preserves_doc_iri(self):
|
||||
doc = "urn:isbn:978-3-16-148410-0"
|
||||
result = page_uri(doc, 1)
|
||||
assert result.startswith(doc)
|
||||
|
||||
def test_chunk_from_page_hierarchy(self):
|
||||
"""Chunk URI should contain both page and chunk identifiers."""
|
||||
result = chunk_uri_from_page("https://example.com/doc", 3, 5)
|
||||
assert "/p3/" in result
|
||||
assert result.endswith("/c5")
|
||||
def test_chunk_uri_unique(self):
|
||||
r1 = chunk_uri()
|
||||
r2 = chunk_uri()
|
||||
assert r1 != r2
|
||||
|
||||
|
||||
class TestActivityAndSubgraphUris:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue