fix: reject invalid PDF decoder input (#977)

2026-06-17 02:45:14 +02:00 · 2026-06-09 11:37:10 -04:00 · 2026-06-09 11:37:10 -04:00 · 79d7ef6a90
commit 79d7ef6a90
parent e1c9351454
2 changed files with 79 additions and 29 deletions
--- a/tests/unit/test_decoding/test_pdf_decoder.py
+++ b/tests/unit/test_decoding/test_pdf_decoder.py
@ -49,7 +49,7 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
    async def test_on_message_success(self, mock_pdf_loader_class, mock_producer, mock_consumer):
        """Test successful PDF processing"""
        # Mock PDF content
-        pdf_content = b"fake pdf content"
+        pdf_content = b"%PDF-1.7\nfake pdf content"
        pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')
        # Mock PyPDFLoader
@ -88,13 +88,55 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
        # Verify triples were sent for each page (provenance)
        assert mock_triples_flow.send.call_count == 2
    @patch('trustgraph.base.librarian_client.Consumer')
    @patch('trustgraph.base.librarian_client.Producer')
    @patch('trustgraph.decoding.pdf.pdf_decoder.PyPDFLoader')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_on_message_rejects_librarian_content_that_is_not_pdf(self, mock_pdf_loader_class, mock_producer, mock_consumer):
        """Non-PDF bytes returned by the librarian must be dropped before the PDF loader runs"""
        # The librarian hands back base64 *bytes* wrapping an HTML page
        # instead of a PDF document.
        encoded_body = base64.b64encode(b"<html><body>Not found</body></html>")

        # Flow lookups by name resolve to these two sinks; any other name
        # resolves to None.
        output_sink = AsyncMock()
        triples_sink = AsyncMock()
        sinks = {
            "output": output_sink,
            "triples": triples_sink,
        }
        flow = MagicMock(side_effect=lambda name: sinks.get(name))

        # Librarian stub: metadata claims a PDF, the content is not one.
        librarian = flow.librarian
        librarian.fetch_document_metadata = AsyncMock(
            return_value=MagicMock(kind="application/pdf")
        )
        librarian.fetch_document_content = AsyncMock(
            return_value=encoded_body
        )
        librarian.save_child_document = AsyncMock()

        # Incoming message carrying a document that references the librarian.
        message = MagicMock()
        message.value.return_value = Document(
            metadata=Metadata(id="test-doc"),
            document_id="doc-123",
        )

        processor = Processor(id='test-pdf-decoder', taskgroup=AsyncMock())

        await processor.on_message(message, None, flow)

        # Nothing downstream may have been touched.
        mock_pdf_loader_class.assert_not_called()
        output_sink.send.assert_not_called()
        triples_sink.send.assert_not_called()
        librarian.save_child_document.assert_not_called()
    @patch('trustgraph.base.librarian_client.Consumer')
    @patch('trustgraph.base.librarian_client.Producer')
    @patch('trustgraph.decoding.pdf.pdf_decoder.PyPDFLoader')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_on_message_empty_pdf(self, mock_pdf_loader_class, mock_producer, mock_consumer):
        """Test handling of empty PDF"""
-        pdf_content = b"fake pdf content"
+        pdf_content = b"%PDF-1.7\nfake pdf content"
        pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')
        mock_loader = MagicMock()
@ -126,7 +168,7 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
    async def test_on_message_unicode_content(self, mock_pdf_loader_class, mock_producer, mock_consumer):
        """Test handling of unicode content in PDF"""
-        pdf_content = b"fake pdf content"
+        pdf_content = b"%PDF-1.7\nfake pdf content"
        pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')
        mock_loader = MagicMock()
--- a/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py
+++ b/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py
@ -32,6 +32,10 @@ logger = logging.getLogger(__name__)
 default_ident = "document-decoder"
 def _looks_like_pdf(content):
    """Report whether ``content`` begins with the ``%PDF-`` marker.

    Leading whitespace bytes are skipped before the comparison, so a
    payload with blank bytes ahead of the marker is still accepted.
    """
    pdf_marker = b"%PDF-"
    trimmed = content.lstrip()
    return trimmed.startswith(pdf_marker)
 class Processor(FlowProcessor):
    def __init__(self, **params):
@ -94,33 +98,37 @@ class Processor(FlowProcessor):
                )
                return
-        with tempfile.NamedTemporaryFile(delete_on_close=False, suffix='.pdf') as fp:
+        # Check if we should fetch from librarian or use inline data
        if v.document_id:
            # Fetch from librarian via Pulsar
            logger.info(f"Fetching document {v.document_id} from librarian...")
            content = await flow.librarian.fetch_document_content(
                document_id=v.document_id,
            )
            # Content is base64 encoded
            if isinstance(content, str):
                content = content.encode('utf-8')
            decoded_content = base64.b64decode(content)
            logger.info(f"Fetched {len(decoded_content)} bytes from librarian")
        else:
            # Use inline data (backward compatibility)
            decoded_content = base64.b64decode(v.data)
        if not _looks_like_pdf(decoded_content):
            logger.error(
                f"Document {v.metadata.id} is not valid PDF content. "
                f"Ignoring document."
            )
            return
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as fp:
            temp_path = fp.name
-
+            fp.write(decoded_content)
-            # Check if we should fetch from librarian or use inline data
+            fp.close()
            if v.document_id:
                # Fetch from librarian via Pulsar
                logger.info(f"Fetching document {v.document_id} from librarian...")
                fp.close()
                content = await flow.librarian.fetch_document_content(
                    document_id=v.document_id,
                )
                # Content is base64 encoded
                if isinstance(content, str):
                    content = content.encode('utf-8')
                decoded_content = base64.b64decode(content)
                with open(temp_path, 'wb') as f:
                    f.write(decoded_content)
                logger.info(f"Fetched {len(decoded_content)} bytes from librarian")
            else:
                # Use inline data (backward compatibility)
                fp.write(base64.b64decode(v.data))
                fp.close()
            global PyPDFLoader
            if PyPDFLoader is None: