RabbitMQ pub/sub backend with topic exchange architecture (#752)

Adds a RabbitMQ backend as an alternative to Pulsar, selectable via PUBSUB_BACKEND=rabbitmq. Both backends implement the same PubSubBackend protocol — no application code changes needed to switch. RabbitMQ topology: - Single topic exchange per topicspace (e.g. 'tg') - Routing key derived from queue class and topic name - Shared consumers: named queue bound to exchange (competing, round-robin) - Exclusive consumers: anonymous auto-delete queue (broadcast, each gets every message). Used by Subscriber and config push consumer. - Thread-local producer connections (pika is not thread-safe) - Push-based consumption via basic_consume with process_data_events for heartbeat processing Consumer model changes: - Consumer class creates one backend consumer per concurrent task (required for pika thread safety, harmless for Pulsar) - Consumer class accepts consumer_type parameter - Subscriber passes consumer_type='exclusive' for broadcast semantics - Config push consumer uses consumer_type='exclusive' so every processor instance receives config updates - handle_one_from_queue receives consumer as parameter for correct per-connection ack/nack LibrarianClient: - New shared client class replacing duplicated librarian request-response code across 6+ services (chunking, decoders, RAG, etc.) - Uses stream-document instead of get-document-content for fetching document content in 1MB chunks (avoids broker message size limits) - Standalone object (self.librarian = LibrarianClient(...)) not a mixin - get-document-content marked deprecated in schema and OpenAPI spec Serialisation: - Extracted dataclass_to_dict/dict_to_dataclass to shared serialization.py (used by both Pulsar and RabbitMQ backends) Librarian queues: - Changed from flow class (persistent) back to request/response class now that stream-document eliminates large single messages - API upload chunk size reduced from 5MB to 3MB to stay under broker limits after base64 encoding Factory and CLI: - get_pubsub() handles 'rabbitmq' backend with RabbitMQ connection params - add_pubsub_args() includes RabbitMQ options (host, port, credentials) - add_pubsub_args(standalone=True) defaults to localhost for CLI tools - init_trustgraph skips Pulsar admin setup for non-Pulsar backends - tg-dump-queues and tg-monitor-prompts use backend abstraction - BaseClient and ConfigClient accept generic pubsub config
2026-04-25 08:26:21 +02:00 · 2026-04-02 12:47:16 +01:00 · 2026-04-02 12:47:16 +01:00 · 24f0190ce7
commit 24f0190ce7
parent 4fb0b4d8e8
36 changed files with 1277 additions and 1313 deletions
--- a/tests/unit/test_decoding/test_pdf_decoder.py
+++ b/tests/unit/test_decoding/test_pdf_decoder.py
@ -24,12 +24,10 @@ class MockAsyncProcessor:
 class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
    """Test PDF decoder processor functionality"""

-    @patch('trustgraph.base.chunking_service.Consumer')
-    @patch('trustgraph.base.chunking_service.Producer')
-    @patch('trustgraph.decoding.pdf.pdf_decoder.Consumer')
-    @patch('trustgraph.decoding.pdf.pdf_decoder.Producer')
+    @patch('trustgraph.base.librarian_client.Consumer')
+    @patch('trustgraph.base.librarian_client.Producer')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
-    async def test_processor_initialization(self, mock_producer, mock_consumer, mock_cs_producer, mock_cs_consumer):
+    async def test_processor_initialization(self, mock_producer, mock_consumer):
        """Test PDF decoder processor initialization"""
        config = {
            'id': 'test-pdf-decoder',
@ -44,13 +42,11 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
        assert consumer_specs[0].name == "input"
        assert consumer_specs[0].schema == Document

-    @patch('trustgraph.base.chunking_service.Consumer')
-    @patch('trustgraph.base.chunking_service.Producer')
-    @patch('trustgraph.decoding.pdf.pdf_decoder.Consumer')
-    @patch('trustgraph.decoding.pdf.pdf_decoder.Producer')
+    @patch('trustgraph.base.librarian_client.Consumer')
+    @patch('trustgraph.base.librarian_client.Producer')
    @patch('trustgraph.decoding.pdf.pdf_decoder.PyPDFLoader')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
-    async def test_on_message_success(self, mock_pdf_loader_class, mock_producer, mock_consumer, mock_cs_producer, mock_cs_consumer):
+    async def test_on_message_success(self, mock_pdf_loader_class, mock_producer, mock_consumer):
        """Test successful PDF processing"""
        # Mock PDF content
        pdf_content = b"fake pdf content"
@ -85,7 +81,7 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
        processor = Processor(**config)

        # Mock save_child_document to avoid waiting for librarian response
-        processor.save_child_document = AsyncMock(return_value="mock-doc-id")
+        processor.librarian.save_child_document = AsyncMock(return_value="mock-doc-id")

        await processor.on_message(mock_msg, None, mock_flow)

@ -94,13 +90,11 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
        # Verify triples were sent for each page (provenance)
        assert mock_triples_flow.send.call_count == 2

-    @patch('trustgraph.base.chunking_service.Consumer')
-    @patch('trustgraph.base.chunking_service.Producer')
-    @patch('trustgraph.decoding.pdf.pdf_decoder.Consumer')
-    @patch('trustgraph.decoding.pdf.pdf_decoder.Producer')
+    @patch('trustgraph.base.librarian_client.Consumer')
+    @patch('trustgraph.base.librarian_client.Producer')
    @patch('trustgraph.decoding.pdf.pdf_decoder.PyPDFLoader')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
-    async def test_on_message_empty_pdf(self, mock_pdf_loader_class, mock_producer, mock_consumer, mock_cs_producer, mock_cs_consumer):
+    async def test_on_message_empty_pdf(self, mock_pdf_loader_class, mock_producer, mock_consumer):
        """Test handling of empty PDF"""
        pdf_content = b"fake pdf content"
        pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')
@ -128,13 +122,11 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):

        mock_output_flow.send.assert_not_called()

-    @patch('trustgraph.base.chunking_service.Consumer')
-    @patch('trustgraph.base.chunking_service.Producer')
-    @patch('trustgraph.decoding.pdf.pdf_decoder.Consumer')
-    @patch('trustgraph.decoding.pdf.pdf_decoder.Producer')
+    @patch('trustgraph.base.librarian_client.Consumer')
+    @patch('trustgraph.base.librarian_client.Producer')
    @patch('trustgraph.decoding.pdf.pdf_decoder.PyPDFLoader')
    @patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
-    async def test_on_message_unicode_content(self, mock_pdf_loader_class, mock_producer, mock_consumer, mock_cs_producer, mock_cs_consumer):
+    async def test_on_message_unicode_content(self, mock_pdf_loader_class, mock_producer, mock_consumer):
        """Test handling of unicode content in PDF"""
        pdf_content = b"fake pdf content"
        pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')
@ -165,7 +157,7 @@ class TestPdfDecoderProcessor(IsolatedAsyncioTestCase):
        processor = Processor(**config)

        # Mock save_child_document to avoid waiting for librarian response
-        processor.save_child_document = AsyncMock(return_value="mock-doc-id")
+        processor.librarian.save_child_document = AsyncMock(return_value="mock-doc-id")

        await processor.on_message(mock_msg, None, mock_flow)