From 9f84891fccf3473eb4afe7432f98a1be0312402f Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Thu, 16 Apr 2026 17:19:39 +0100 Subject: [PATCH 1/7] Flow service lifecycle management (#822) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit feat: separate flow service from config service with explicit queue lifecycle management The flow service is now an independent service that owns the lifecycle of flow and blueprint queues. System services own their own queues. Consumers never create queues. Flow service separation: - New service at trustgraph-flow/trustgraph/flow/service/ - Uses async ConfigClient (RequestResponse pattern) to talk to config service - Config service stripped of all flow handling Queue lifecycle management: - PubSubBackend protocol gains create_queue, delete_queue, queue_exists, ensure_queue — all async - RabbitMQ: implements via pika with asyncio.to_thread internally - Pulsar: stubs for future admin REST API implementation - Consumer _connect() no longer creates queues (passive=True for named queues) - System services call ensure_queue on startup - Flow service creates queues on flow start, deletes on flow stop - Flow service ensures queues for pre-existing flows on startup Two-phase flow stop: - Phase 1: set flow status to "stopping", delete processor config entries - Phase 2: retry queue deletion, then delete flow record Config restructure: - active-flow config replaced with processor:{name} types - Each processor has its own config type, each flow variant is a key - Flow start/stop use batch put/delete — single config push per operation - FlowProcessor subscribes to its own type only Blueprint format: - Processor entries split into topics and parameters dicts - Flow interfaces use {"flow": "topic"} instead of bare strings - Specs (ConsumerSpec, ProducerSpec, etc.) read from definition["topics"] Tests updated --- .../tech-specs/active-flow-key-restructure.md | 67 ++++ .../flow-service-queue-lifecycle.md | 299 ++++++++++++++++++ .../unit/test_base/test_flow_base_modules.py | 2 +- tests/unit/test_base/test_flow_processor.py | 264 ++++++---------- .../unit/test_cores/test_knowledge_manager.py | 4 +- .../unit/test_gateway/test_config_receiver.py | 2 +- .../test_gateway/test_dispatch_manager.py | 14 +- trustgraph-base/trustgraph/base/__init__.py | 3 +- trustgraph-base/trustgraph/base/backend.py | 56 ++++ .../trustgraph/base/config_client.py | 92 ++++++ .../trustgraph/base/consumer_spec.py | 2 +- .../trustgraph/base/flow_processor.py | 21 +- .../trustgraph/base/parameter_spec.py | 2 +- .../trustgraph/base/producer_spec.py | 2 +- .../trustgraph/base/pulsar_backend.py | 20 ++ .../trustgraph/base/rabbitmq_backend.py | 159 +++++++++- .../trustgraph/base/request_response_spec.py | 4 +- .../trustgraph/base/subscriber_spec.py | 2 +- trustgraph-cli/trustgraph/cli/show_flows.py | 2 +- trustgraph-flow/pyproject.toml | 1 + .../trustgraph/config/service/service.py | 95 +----- trustgraph-flow/trustgraph/cores/knowledge.py | 4 +- trustgraph-flow/trustgraph/cores/service.py | 6 + trustgraph-flow/trustgraph/flow/__init__.py | 2 + .../trustgraph/flow/service/__init__.py | 2 + .../trustgraph/flow/service/__main__.py | 6 + .../{config => flow}/service/flow.py | 279 +++++++++++----- .../trustgraph/flow/service/service.py | 162 ++++++++++ .../trustgraph/gateway/config/receiver.py | 2 +- .../trustgraph/gateway/dispatch/manager.py | 6 +- .../trustgraph/librarian/service.py | 18 +- 31 files changed, 1202 insertions(+), 398 deletions(-) create mode 100644 docs/tech-specs/active-flow-key-restructure.md create mode 100644 docs/tech-specs/flow-service-queue-lifecycle.md create mode 100644 trustgraph-base/trustgraph/base/config_client.py create mode 100644 trustgraph-flow/trustgraph/flow/__init__.py create mode 100644 trustgraph-flow/trustgraph/flow/service/__init__.py create mode 100644 trustgraph-flow/trustgraph/flow/service/__main__.py rename trustgraph-flow/trustgraph/{config => flow}/service/flow.py (57%) create mode 100644 trustgraph-flow/trustgraph/flow/service/service.py diff --git a/docs/tech-specs/active-flow-key-restructure.md b/docs/tech-specs/active-flow-key-restructure.md new file mode 100644 index 00000000..754e7953 --- /dev/null +++ b/docs/tech-specs/active-flow-key-restructure.md @@ -0,0 +1,67 @@ +--- +layout: default +title: "Active-Flow Key Restructure" +parent: "Tech Specs" +--- + +# Active-Flow Key Restructure + +## Problem + +Active-flow config uses `('active-flow', processor)` as its key, where +each processor's value is a JSON blob containing all flow variants +assigned to that processor: + +``` +('active-flow', 'chunker') -> { "default": {...}, "flow2": {...} } +``` + +This causes two problems: + +1. **Read-modify-write on every change.** Starting or stopping a flow + requires fetching the processor's current blob, parsing it, adding + or removing a variant, serialising it, and writing it back. This is + a concurrency hazard if two flow operations target the same + processor simultaneously. + +2. **Noisy config pushes.** Config subscribers subscribe to a type, + not a specific key. Every active-flow write triggers a config push + that causes every processor in the system to fetch the full config + and re-evaluate, even though only one processor's config changed. + With N processors in a blueprint, a single flow start/stop causes + N writes and N^2 config fetches across the system. + +## Proposed Change + +Restructure the key to `('active-flow', 'processor:variant')` where +each key holds a single flow variant's configuration: + +``` +('active-flow', 'chunker:default') -> { "topics": {...}, "parameters": {...} } +('active-flow', 'chunker:flow2') -> { "topics": {...}, "parameters": {...} } +``` + +Starting a flow is a set of clean puts. Stopping a flow is a set of +clean deletes. No read-modify-write. No JSON blob merging. + +The config push problem (all processors fetching on every change) +remains — that's a limitation of the config subscription model and +would require per-key subscriptions to solve. But eliminating the +read-modify-write removes the concurrency hazard and simplifies the +flow service code. + +## What Changes + +- **Flow service** (`flow.py`): `handle_start_flow` writes individual + keys per processor:variant instead of merging into per-processor + blobs. `handle_stop_flow` deletes individual keys instead of + read-modify-write. +- **FlowProcessor** (`flow_processor.py`): `on_configure_flows` + currently looks up `config["active-flow"][self.id]` to find a JSON + blob of all its variants. Needs to scan all active-flow keys for + entries prefixed with `self.id:` and assemble its flow list from + those. +- **Config client**: May benefit from a prefix-scan or pattern-match + query to support the FlowProcessor lookup efficiently. +- **Initial config / bootstrapping**: Any code that seeds active-flow + entries at deployment time needs to use the new key format. diff --git a/docs/tech-specs/flow-service-queue-lifecycle.md b/docs/tech-specs/flow-service-queue-lifecycle.md new file mode 100644 index 00000000..a724a2ef --- /dev/null +++ b/docs/tech-specs/flow-service-queue-lifecycle.md @@ -0,0 +1,299 @@ +--- +layout: default +title: "Flow Service Separation and Queue Lifecycle Management" +parent: "Tech Specs" +--- + +# Flow Service Separation and Queue Lifecycle Management + +## Overview + +This specification describes the separation of the flow service from the +config service into an independent service, and the addition of explicit +queue lifecycle management to the pub/sub backend abstraction. + +Every queue in the system has an explicit owner responsible for its +creation and deletion: + +- **Flow and blueprint queues** — owned by the flow service +- **System queues** (config, librarian, knowledge, etc.) — owned by + the services themselves + +Consumers never create queues. They connect to queues that already +exist. + +This addresses a fundamental problem across broker backends: without an +authoritative lifecycle owner, queues are created as a side effect of +consumer connections and never explicitly deleted. In RabbitMQ, this +leads to orphaned durable queues. In Pulsar, persistent topics and +subscriptions survive consumer crashes. In Kafka, topics persist +indefinitely. The solution is the same for all backends: explicit +lifecycle management through the `PubSubBackend` protocol. + +--- + +## Background + +### Current Architecture + +The flow service (`FlowConfig`) and config service (`Configuration`) +are co-located in a single process: `trustgraph-flow/config/service`. +They share a `Processor` class that inherits from `AsyncProcessor` and +manages both config and flow request/response queues. `FlowConfig` +receives a direct reference to the `Configuration` object, giving it +backdoor access to `inc_version()` and `push()` — methods that bypass +the config service's own pub/sub interface. + +The flow service manages flow lifecycle (start/stop) by manipulating +config state — active-flow entries, flow records, blueprint lookups — +but takes no active part in broker queue management. Queues are created +implicitly when the first consumer connects and are never explicitly +deleted. + +### The Queue Lifecycle Problem + +Queues are currently created as a side effect of consumer connections +(in `_connect()` for RabbitMQ, in `subscribe()` for Pulsar). No single +component owns queue lifecycle, leading to two failure modes: + +- **Orphaned queues**: When a flow is stopped, consumers shut down but + their queues remain — along with any messages in them. In RabbitMQ, + durable queues persist indefinitely. In Pulsar, persistent topics and + their subscriptions survive consumer disconnection unless + `unsubscribe()` is explicitly called — which doesn't happen on crash + or error paths. +- **Premature deletion**: If consumers attempt to delete queues on + exit, error-path shutdowns destroy queues that other consumers or the + system still need. + +Neither strategy is reliable because neither the consumer nor the +broker knows whether a queue should exist — only the flow manager +knows that. + +### Why Separation + +The flow service currently piggybacks on the config service process. +Adding broker queue management to the flow service introduces operations +that may have significant latency — RabbitMQ queue operations are +generally fast, but Kafka topic creation can involve partition +assignment, replication, and leader election delays. + +The config service is on the critical path for every service in the +system — all services read configuration on startup and respond to +config pushes. Blocking the config service's request/response loop +while waiting for broker operations risks cascading latency across the +entire deployment. + +Separating the flow service into its own process gives it an +independent latency budget. A slow `start-flow` (waiting for queue +creation across multiple brokers) does not affect config reads. +Additionally, the flow service's direct access to the `Configuration` +object is a coupling that masks what should be a clean client +relationship — the flow service only needs to read and write config +entries, which is exactly what the existing config client provides. + +--- + +## Design + +### Queue Ownership Model + +Every queue in the system has exactly one owner responsible for its +creation and deletion: + +| Queue type | Owner | Created when | Deleted when | +|---|---|---|---| +| Flow queues | Flow service | `start-flow` | `stop-flow` | +| Blueprint queues | Flow service | `start-flow` (idempotent) | Never (shared across flow instances) | +| System queues (config, librarian, knowledge, etc.) | Each service | Service startup | Never (system lifetime) | + +Consumers never create queues. The consumer's `_connect()` method +connects to a queue that must already exist — it does not declare or +create it. + +### Flow Service as Independent Service + +The flow service becomes its own `Processor(AsyncProcessor)` in a +separate module and process. It: + +- Listens on the existing flow request/response queues (already distinct + from config queues — no consumer migration needed). +- Uses the async `ConfigClient` (extending `RequestResponse`) to + read/write config state (blueprints, active-flow entries, flow + records). Config pushes are triggered automatically by config + writes — the backdoor `inc_version()` and `push()` calls are no + longer needed. +- Has direct access to the pub/sub backend (inherited from + `AsyncProcessor`) for queue lifecycle operations. + +The config service (`trustgraph-flow/config/service`) is simplified: +the flow consumer, flow producer, and `FlowConfig` class are removed. +It returns to being purely a config service. + +### Queue Lifecycle in the Pub/Sub Backend + +The `PubSubBackend` protocol gains queue management methods. All new +methods are async — backends that use blocking I/O (e.g., pika for +RabbitMQ) handle threading internally. + +``` +PubSubBackend: + create_producer(...) # existing + create_consumer(...) # existing + close() # existing + async create_queue(topic, subscription) # new + async delete_queue(topic, subscription) # new + async queue_exists(topic, subscription) # new + async ensure_queue(topic, subscription) # new +``` + +- `create_queue` — create a queue. Idempotent if queue already exists + with the same properties. Fails if properties mismatch. +- `delete_queue` — delete a queue and its messages. Idempotent if + queue does not exist. +- `queue_exists` — check whether a queue exists. Returns bool. +- `ensure_queue` — create-if-not-exists convenience wrapper. + +The `topic` and `subscription` parameters together identify the queue, +mirroring `create_consumer` where the queue name is derived from both. + +Backend implementations: + +- **RabbitMQ**: `queue_declare`, `queue_delete`, and + `queue_declare(passive=True)` via pika. Blocking calls wrapped in + `asyncio.to_thread` inside the backend. Queue name derived using the + existing `_parse_queue_id` logic. +- **Pulsar**: REST calls to the Pulsar admin API (port 8080). + Create/delete persistent topics, delete subscriptions. Requires admin + URL as additional configuration alongside the broker URL. +- **Kafka** (future): `AdminClient.create_topics()` and + `AdminClient.delete_topics()` from the `confluent-kafka` library. + Uses the same bootstrap servers as the broker connection. + +### Flow Start: Queue Creation + +When `handle_start_flow` processes a flow start request, after +resolving parameters and computing the template-substituted topic +identifiers, it creates queues before writing config state. + +Queues are created for both `cls["blueprint"]` and `cls["flow"]` +entries. Blueprint queue creation is idempotent — multiple flows +creating the same blueprint queue is safe. + +The flow start request returns only after queues are confirmed ready. +This gives callers a hard guarantee: when `start-flow` succeeds, the +data path is fully wired. + +### Flow Stop: Two-Phase Shutdown + +Stopping a flow is a two-phase transaction with a retry window between +them. + +**Phase 1 — Signal processors to shut down:** + +1. Set the flow record's status to `"stopping"`. This marks the flow + as in-progress so that if the flow service crashes mid-stop, it can + identify and resume incomplete shutdowns on restart. +2. Remove the flow's variants from each processor's `active-flow` + config entries. +3. Config push fires automatically. Each `FlowProcessor` receives the + update, compares wanted vs current flows, and calls `stop_flow` on + flows no longer wanted — closing consumers and producers. + +**Phase 2 — Delete queues with retries, then finalise:** + +1. Retry queue deletion with delays, giving processors time to react + to the config change and disconnect. Queue deletion is idempotent — + if a queue was already removed by a previous attempt or was never + created, the operation succeeds silently. Only `cls["flow"]` entries + (per-flow-instance queues) are deleted — `cls["blueprint"]` entries + are shared infrastructure and are not touched. +2. Delete the `flow` record from config. + +The flow service retries persistently. A queue that cannot be deleted +after retries is logged as an error but does not block the stop +transaction from completing — a leaked queue is less harmful than a +flow that cannot be stopped. + +**Crash recovery:** On startup, the flow service scans for flow +records with `"status": "stopping"`. These represent incomplete +shutdowns from a previous run. For each, it resumes from the +appropriate point — if active-flow entries are already cleared, it +proceeds directly to phase 2 (queue deletion and flow record cleanup). + +### System Service Queues + +System services (config, librarian, knowledge, etc.) are not managed +by the flow service. Each service calls `ensure_queue` for its own +queues during startup. These queues persist for the lifetime of the +system and are never explicitly deleted. + +### Consumer Connection + +Consumers never create queues. The consumer connects to a queue that +must already exist — created either by the flow service (for flow and +blueprint queues) or by the service itself (for system queues). + +For RabbitMQ, this means `_connect()` no longer calls `queue_declare`. +It connects to a queue by name and fails if the queue does not exist. + +For Pulsar, `subscribe()` inherently creates a subscription. This is +how Pulsar works and does not conflict with the lifecycle model — +Pulsar's broker manages subscription state, and the flow service uses +the admin API for explicit cleanup. + +--- + +## Operational Impact + +### Deployment + +The flow service is a new container/process alongside the existing +config service. It requires: + +- Access to the message broker (same credentials as other services — + inherited from `AsyncProcessor` via standard CLI args). +- Access to the config service via pub/sub (config request/response + queues — same as any other service that reads config). +- For Pulsar: the admin API URL (separate from the broker URL). + +It does not require direct Cassandra access. + +### Backward Compatibility + +- The flow request/response queue interface is unchanged — API gateway + and CLI tools that send flow requests continue to work without + modification. +- The config service loses its flow handling capability, so both + services must be deployed together. This is a breaking change in + deployment topology but not in API. +- Queue deletion on flow stop is new behaviour. Existing deployments + that rely on queues persisting after flow stop (e.g. for post-mortem + message inspection) would need to drain queues before stopping flows. + +--- + +## Assumptions + +- **The flow service is the sole writer of flow configuration.** The + two-phase stop transaction relies on the flow record's `"stopping"` + status being authoritative — no other service or process modifies + flow records, active-flow entries, or flow blueprints. This is true + today (only `FlowConfig` writes to these config keys) and must remain + true after separation. The config service provides the storage, but + the flow service owns the semantics. + +--- + +## Design Decisions + +| Decision | Resolution | Rationale | +|---|---|---| +| Queue ownership | Every queue has exactly one explicit owner | Eliminates implicit creation, makes lifecycle auditable | +| Queue deletion strategy | Retry persistently, don't block stop | A leaked queue is less harmful than a flow stuck in stopping state | +| Purge without delete | Not needed | Flows are fully dynamic — stop and restart recreates everything | +| Blueprint-level queues | Created on flow start (idempotent), never deleted | Shared across flow instances; creation is safe, deletion would break other flows | +| Flow stop atomicity | Two-phase with `"stopping"` state | Allows crash recovery; flow service can resume incomplete shutdowns | +| Backend protocol methods | All async | Backends hide blocking I/O internally; callers never deal with threading | +| Pulsar lifecycle | REST admin API, not no-op | Persistent topics and subscriptions survive crashes; explicit cleanup needed | +| Consumer queue creation | Consumers never create queues | Single ownership; `_connect()` connects to existing queues only | diff --git a/tests/unit/test_base/test_flow_base_modules.py b/tests/unit/test_base/test_flow_base_modules.py index 33a3c4c2..5bbd7a18 100644 --- a/tests/unit/test_base/test_flow_base_modules.py +++ b/tests/unit/test_base/test_flow_base_modules.py @@ -11,7 +11,7 @@ def test_parameter_spec_is_a_spec_and_adds_parameter_value(): flow = MagicMock(parameter={}) processor = MagicMock() - spec.add(flow, processor, {"temperature": 0.7}) + spec.add(flow, processor, {"parameters": {"temperature": 0.7}}) assert isinstance(spec, Spec) assert "temperature" in flow.parameter diff --git a/tests/unit/test_base/test_flow_processor.py b/tests/unit/test_base/test_flow_processor.py index 8672831a..36a05ec2 100644 --- a/tests/unit/test_base/test_flow_processor.py +++ b/tests/unit/test_base/test_flow_processor.py @@ -1,58 +1,50 @@ """ Unit tests for trustgraph.base.flow_processor -Starting small with a single test to verify basic functionality """ import pytest from unittest.mock import AsyncMock, MagicMock, patch from unittest import IsolatedAsyncioTestCase -# Import the service under test from trustgraph.base.flow_processor import FlowProcessor +# Patches needed to let AsyncProcessor.__init__ run without real +# infrastructure while still setting self.id correctly. +ASYNC_PROCESSOR_PATCHES = [ + patch('trustgraph.base.async_processor.get_pubsub', return_value=MagicMock()), + patch('trustgraph.base.async_processor.ProcessorMetrics', return_value=MagicMock()), + patch('trustgraph.base.async_processor.Consumer', return_value=MagicMock()), +] + + +def with_async_processor_patches(func): + """Apply all AsyncProcessor dependency patches to a test.""" + for p in reversed(ASYNC_PROCESSOR_PATCHES): + func = p(func) + return func + + class TestFlowProcessorSimple(IsolatedAsyncioTestCase): """Test FlowProcessor base class functionality""" - @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') - @patch('trustgraph.base.async_processor.AsyncProcessor.register_config_handler') - async def test_flow_processor_initialization_basic(self, mock_register_config, mock_async_init): + @with_async_processor_patches + async def test_flow_processor_initialization_basic(self, *mocks): """Test basic FlowProcessor initialization""" - # Arrange - mock_async_init.return_value = None - mock_register_config.return_value = None - config = { 'id': 'test-flow-processor', 'taskgroup': AsyncMock() } - # Act processor = FlowProcessor(**config) - # Assert - # Verify AsyncProcessor.__init__ was called - mock_async_init.assert_called_once() - - # Verify register_config_handler was called with the correct handler - mock_register_config.assert_called_once_with( - processor.on_configure_flows, types=["active-flow"] - ) - - # Verify FlowProcessor-specific initialization - assert hasattr(processor, 'flows') + assert processor.id == 'test-flow-processor' assert processor.flows == {} - assert hasattr(processor, 'specifications') assert processor.specifications == [] - @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') - @patch('trustgraph.base.async_processor.AsyncProcessor.register_config_handler') - async def test_register_specification(self, mock_register_config, mock_async_init): + @with_async_processor_patches + async def test_register_specification(self, *mocks): """Test registering a specification""" - # Arrange - mock_async_init.return_value = None - mock_register_config.return_value = None - config = { 'id': 'test-flow-processor', 'taskgroup': AsyncMock() @@ -62,288 +54,210 @@ class TestFlowProcessorSimple(IsolatedAsyncioTestCase): mock_spec = MagicMock() mock_spec.name = 'test-spec' - # Act processor.register_specification(mock_spec) - # Assert assert len(processor.specifications) == 1 assert processor.specifications[0] == mock_spec @patch('trustgraph.base.flow_processor.Flow') - @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') - @patch('trustgraph.base.async_processor.AsyncProcessor.register_config_handler') - async def test_start_flow(self, mock_register_config, mock_async_init, mock_flow_class): + @with_async_processor_patches + async def test_start_flow(self, *mocks): """Test starting a flow""" - # Arrange - mock_async_init.return_value = None - mock_register_config.return_value = None - + mock_flow_class = mocks[-1] + config = { - 'id': 'test-flow-processor', + 'id': 'test-processor', 'taskgroup': AsyncMock() } processor = FlowProcessor(**config) - processor.id = 'test-processor' # Set id for Flow creation - + mock_flow = AsyncMock() mock_flow_class.return_value = mock_flow - + flow_name = 'test-flow' flow_defn = {'config': 'test-config'} - # Act await processor.start_flow(flow_name, flow_defn) - # Assert assert flow_name in processor.flows - # Verify Flow was created with correct parameters - mock_flow_class.assert_called_once_with('test-processor', flow_name, processor, flow_defn) - # Verify the flow's start method was called + mock_flow_class.assert_called_once_with( + 'test-processor', flow_name, processor, flow_defn + ) mock_flow.start.assert_called_once() @patch('trustgraph.base.flow_processor.Flow') - @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') - @patch('trustgraph.base.async_processor.AsyncProcessor.register_config_handler') - async def test_stop_flow(self, mock_register_config, mock_async_init, mock_flow_class): + @with_async_processor_patches + async def test_stop_flow(self, *mocks): """Test stopping a flow""" - # Arrange - mock_async_init.return_value = None - mock_register_config.return_value = None - + mock_flow_class = mocks[-1] + config = { - 'id': 'test-flow-processor', + 'id': 'test-processor', 'taskgroup': AsyncMock() } processor = FlowProcessor(**config) - processor.id = 'test-processor' - + mock_flow = AsyncMock() mock_flow_class.return_value = mock_flow - - flow_name = 'test-flow' - flow_defn = {'config': 'test-config'} - # Start a flow first - await processor.start_flow(flow_name, flow_defn) - - # Act + flow_name = 'test-flow' + await processor.start_flow(flow_name, {'config': 'test-config'}) + await processor.stop_flow(flow_name) - # Assert assert flow_name not in processor.flows mock_flow.stop.assert_called_once() - @patch('trustgraph.base.flow_processor.Flow') - @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') - @patch('trustgraph.base.async_processor.AsyncProcessor.register_config_handler') - async def test_stop_flow_not_exists(self, mock_register_config, mock_async_init, mock_flow_class): + @with_async_processor_patches + async def test_stop_flow_not_exists(self, *mocks): """Test stopping a flow that doesn't exist""" - # Arrange - mock_async_init.return_value = None - mock_register_config.return_value = None - config = { 'id': 'test-flow-processor', 'taskgroup': AsyncMock() } processor = FlowProcessor(**config) - - # Act - should not raise an exception + await processor.stop_flow('non-existent-flow') - # Assert - flows dict should still be empty assert processor.flows == {} @patch('trustgraph.base.flow_processor.Flow') - @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') - @patch('trustgraph.base.async_processor.AsyncProcessor.register_config_handler') - async def test_on_configure_flows_basic(self, mock_register_config, mock_async_init, mock_flow_class): + @with_async_processor_patches + async def test_on_configure_flows_basic(self, *mocks): """Test basic flow configuration handling""" - # Arrange - mock_async_init.return_value = None - mock_register_config.return_value = None - + mock_flow_class = mocks[-1] + config = { - 'id': 'test-flow-processor', + 'id': 'test-processor', 'taskgroup': AsyncMock() } processor = FlowProcessor(**config) - processor.id = 'test-processor' - + mock_flow = AsyncMock() mock_flow_class.return_value = mock_flow - - # Configuration with flows for this processor - flow_config = { - 'test-flow': {'config': 'test-config'} - } + config_data = { - 'active-flow': { - 'test-processor': '{"test-flow": {"config": "test-config"}}' + 'processor:test-processor': { + 'test-flow': '{"config": "test-config"}' } } - - # Act + await processor.on_configure_flows(config_data, version=1) - # Assert assert 'test-flow' in processor.flows - mock_flow_class.assert_called_once_with('test-processor', 'test-flow', processor, {'config': 'test-config'}) + mock_flow_class.assert_called_once_with( + 'test-processor', 'test-flow', processor, + {'config': 'test-config'} + ) mock_flow.start.assert_called_once() - @patch('trustgraph.base.flow_processor.Flow') - @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') - @patch('trustgraph.base.async_processor.AsyncProcessor.register_config_handler') - async def test_on_configure_flows_no_config(self, mock_register_config, mock_async_init, mock_flow_class): + @with_async_processor_patches + async def test_on_configure_flows_no_config(self, *mocks): """Test flow configuration handling when no config exists for this processor""" - # Arrange - mock_async_init.return_value = None - mock_register_config.return_value = None - config = { - 'id': 'test-flow-processor', + 'id': 'test-processor', 'taskgroup': AsyncMock() } processor = FlowProcessor(**config) - processor.id = 'test-processor' - - # Configuration without flows for this processor + config_data = { - 'active-flow': { - 'other-processor': '{"other-flow": {"config": "other-config"}}' + 'processor:other-processor': { + 'other-flow': '{"config": "other-config"}' } } - - # Act + await processor.on_configure_flows(config_data, version=1) - # Assert assert processor.flows == {} - mock_flow_class.assert_not_called() - @patch('trustgraph.base.flow_processor.Flow') - @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') - @patch('trustgraph.base.async_processor.AsyncProcessor.register_config_handler') - async def test_on_configure_flows_invalid_config(self, mock_register_config, mock_async_init, mock_flow_class): + @with_async_processor_patches + async def test_on_configure_flows_invalid_config(self, *mocks): """Test flow configuration handling with invalid config format""" - # Arrange - mock_async_init.return_value = None - mock_register_config.return_value = None - config = { - 'id': 'test-flow-processor', + 'id': 'test-processor', 'taskgroup': AsyncMock() } processor = FlowProcessor(**config) - processor.id = 'test-processor' - - # Configuration without active-flow key + config_data = { 'other-data': 'some-value' } - - # Act + await processor.on_configure_flows(config_data, version=1) - # Assert assert processor.flows == {} - mock_flow_class.assert_not_called() @patch('trustgraph.base.flow_processor.Flow') - @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') - @patch('trustgraph.base.async_processor.AsyncProcessor.register_config_handler') - async def test_on_configure_flows_start_and_stop(self, mock_register_config, mock_async_init, mock_flow_class): + @with_async_processor_patches + async def test_on_configure_flows_start_and_stop(self, *mocks): """Test flow configuration handling with starting and stopping flows""" - # Arrange - mock_async_init.return_value = None - mock_register_config.return_value = None - + mock_flow_class = mocks[-1] + config = { - 'id': 'test-flow-processor', + 'id': 'test-processor', 'taskgroup': AsyncMock() } processor = FlowProcessor(**config) - processor.id = 'test-processor' - + mock_flow1 = AsyncMock() mock_flow2 = AsyncMock() mock_flow_class.side_effect = [mock_flow1, mock_flow2] - - # First configuration - start flow1 + config_data1 = { - 'active-flow': { - 'test-processor': '{"flow1": {"config": "config1"}}' + 'processor:test-processor': { + 'flow1': '{"config": "config1"}' } } await processor.on_configure_flows(config_data1, version=1) - # Second configuration - stop flow1, start flow2 config_data2 = { - 'active-flow': { - 'test-processor': '{"flow2": {"config": "config2"}}' + 'processor:test-processor': { + 'flow2': '{"config": "config2"}' } } - - # Act + await processor.on_configure_flows(config_data2, version=2) - # Assert - # flow1 should be stopped and removed assert 'flow1' not in processor.flows mock_flow1.stop.assert_called_once() - - # flow2 should be started and added + assert 'flow2' in processor.flows mock_flow2.start.assert_called_once() - @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') - @patch('trustgraph.base.async_processor.AsyncProcessor.register_config_handler') + @with_async_processor_patches @patch('trustgraph.base.async_processor.AsyncProcessor.start') - async def test_start_calls_parent(self, mock_parent_start, mock_register_config, mock_async_init): + async def test_start_calls_parent(self, mock_parent_start, *mocks): """Test that start() calls parent start method""" - # Arrange - mock_async_init.return_value = None - mock_register_config.return_value = None mock_parent_start.return_value = None - + config = { 'id': 'test-flow-processor', 'taskgroup': AsyncMock() } processor = FlowProcessor(**config) - - # Act + await processor.start() - # Assert mock_parent_start.assert_called_once() - @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') - @patch('trustgraph.base.async_processor.AsyncProcessor.register_config_handler') - async def test_add_args_calls_parent(self, mock_register_config, mock_async_init): + async def test_add_args_calls_parent(self): """Test that add_args() calls parent add_args method""" - # Arrange - mock_async_init.return_value = None - mock_register_config.return_value = None - mock_parser = MagicMock() - - # Act + with patch('trustgraph.base.async_processor.AsyncProcessor.add_args') as mock_parent_add_args: FlowProcessor.add_args(mock_parser) - # Assert mock_parent_add_args.assert_called_once_with(mock_parser) if __name__ == '__main__': - pytest.main([__file__]) \ No newline at end of file + pytest.main([__file__]) diff --git a/tests/unit/test_cores/test_knowledge_manager.py b/tests/unit/test_cores/test_knowledge_manager.py index 76095690..80c27fe8 100644 --- a/tests/unit/test_cores/test_knowledge_manager.py +++ b/tests/unit/test_cores/test_knowledge_manager.py @@ -30,8 +30,8 @@ def mock_flow_config(): mock_config.flows = { "test-flow": { "interfaces": { - "triples-store": "test-triples-queue", - "graph-embeddings-store": "test-ge-queue" + "triples-store": {"flow": "test-triples-queue"}, + "graph-embeddings-store": {"flow": "test-ge-queue"} } } } diff --git a/tests/unit/test_gateway/test_config_receiver.py b/tests/unit/test_gateway/test_config_receiver.py index c2a149d5..90ba8d33 100644 --- a/tests/unit/test_gateway/test_config_receiver.py +++ b/tests/unit/test_gateway/test_config_receiver.py @@ -121,7 +121,7 @@ class TestConfigReceiver: fetch_calls.append(kwargs) config_receiver.fetch_and_apply = mock_fetch - for type_name in ["flow", "active-flow"]: + for type_name in ["flow"]: fetch_calls.clear() config_receiver.config_version = 1 diff --git a/tests/unit/test_gateway/test_dispatch_manager.py b/tests/unit/test_gateway/test_dispatch_manager.py index 83969fdd..4ebcb5b9 100644 --- a/tests/unit/test_gateway/test_dispatch_manager.py +++ b/tests/unit/test_gateway/test_dispatch_manager.py @@ -277,7 +277,7 @@ class TestDispatcherManager: # Setup test flow manager.flows["test_flow"] = { "interfaces": { - "triples-store": {"queue": "test_queue"} + "triples-store": {"flow": "test_queue"} } } @@ -298,7 +298,7 @@ class TestDispatcherManager: backend=mock_backend, ws="ws", running="running", - queue={"queue": "test_queue"} + queue="test_queue" ) mock_dispatcher.start.assert_called_once() assert result == mock_dispatcher @@ -328,7 +328,7 @@ class TestDispatcherManager: # Setup test flow manager.flows["test_flow"] = { "interfaces": { - "triples-store": {"queue": "test_queue"} + "triples-store": {"flow": "test_queue"} } } @@ -350,7 +350,7 @@ class TestDispatcherManager: # Setup test flow manager.flows["test_flow"] = { "interfaces": { - "triples-store": {"queue": "test_queue"} + "triples-store": {"flow": "test_queue"} } } @@ -370,7 +370,7 @@ class TestDispatcherManager: backend=mock_backend, ws="ws", running="running", - queue={"queue": "test_queue"}, + queue="test_queue", consumer="api-gateway-test-uuid", subscriber="api-gateway-test-uuid" ) @@ -481,7 +481,7 @@ class TestDispatcherManager: # Setup test flow manager.flows["test_flow"] = { "interfaces": { - "text-load": {"queue": "text_load_queue"} + "text-load": {"flow": "text_load_queue"} } } @@ -502,7 +502,7 @@ class TestDispatcherManager: # Verify dispatcher was created with correct parameters mock_dispatcher_class.assert_called_once_with( backend=mock_backend, - queue={"queue": "text_load_queue"} + queue="text_load_queue" ) mock_dispatcher.start.assert_called_once() mock_dispatcher.process.assert_called_once_with("data", "responder") diff --git a/trustgraph-base/trustgraph/base/__init__.py b/trustgraph-base/trustgraph/base/__init__.py index ce17a585..91622156 100644 --- a/trustgraph-base/trustgraph/base/__init__.py +++ b/trustgraph-base/trustgraph/base/__init__.py @@ -5,7 +5,7 @@ from . consumer import Consumer from . producer import Producer from . publisher import Publisher from . subscriber import Subscriber -from . metrics import ProcessorMetrics, ConsumerMetrics, ProducerMetrics +from . metrics import ProcessorMetrics, ConsumerMetrics, ProducerMetrics, SubscriberMetrics from . logging import add_logging_args, setup_logging from . flow_processor import FlowProcessor from . consumer_spec import ConsumerSpec @@ -22,6 +22,7 @@ from . text_completion_client import ( TextCompletionClientSpec, TextCompletionClient, TextCompletionResult, ) from . prompt_client import PromptClientSpec, PromptClient, PromptResult +from . config_client import ConfigClientSpec, ConfigClient from . triples_store_service import TriplesStoreService from . graph_embeddings_store_service import GraphEmbeddingsStoreService from . document_embeddings_store_service import DocumentEmbeddingsStoreService diff --git a/trustgraph-base/trustgraph/base/backend.py b/trustgraph-base/trustgraph/base/backend.py index f0d6b397..0f95ca1b 100644 --- a/trustgraph-base/trustgraph/base/backend.py +++ b/trustgraph-base/trustgraph/base/backend.py @@ -159,6 +159,62 @@ class PubSubBackend(Protocol): """ ... + async def create_queue(self, topic: str, subscription: str) -> None: + """ + Pre-create a queue so it exists before any consumer connects. + + The topic and subscription together identify the queue, mirroring + create_consumer where the queue name is derived from both. + + Idempotent — creating an already-existing queue succeeds silently. + + Args: + topic: Queue identifier in class:topicspace:topic format + subscription: Subscription/consumer group name + """ + ... + + async def delete_queue(self, topic: str, subscription: str) -> None: + """ + Delete a queue and any messages it contains. + + The topic and subscription together identify the queue, mirroring + create_consumer where the queue name is derived from both. + + Idempotent — deleting a non-existent queue succeeds silently. + + Args: + topic: Queue identifier in class:topicspace:topic format + subscription: Subscription/consumer group name + """ + ... + + async def queue_exists(self, topic: str, subscription: str) -> bool: + """ + Check whether a queue exists. + + Args: + topic: Queue identifier in class:topicspace:topic format + subscription: Subscription/consumer group name + + Returns: + True if the queue exists, False otherwise. + """ + ... + + async def ensure_queue(self, topic: str, subscription: str) -> None: + """ + Ensure a queue exists, creating it if necessary. + + Convenience wrapper — checks existence, creates if missing. + Used by system services on startup. + + Args: + topic: Queue identifier in class:topicspace:topic format + subscription: Subscription/consumer group name + """ + ... + def close(self) -> None: """Close the backend connection.""" ... diff --git a/trustgraph-base/trustgraph/base/config_client.py b/trustgraph-base/trustgraph/base/config_client.py new file mode 100644 index 00000000..c9ec3f9b --- /dev/null +++ b/trustgraph-base/trustgraph/base/config_client.py @@ -0,0 +1,92 @@ + +from . request_response_spec import RequestResponse, RequestResponseSpec +from .. schema import ConfigRequest, ConfigResponse, ConfigKey, ConfigValue + +CONFIG_TIMEOUT = 10 + + +class ConfigClient(RequestResponse): + + async def _request(self, timeout=CONFIG_TIMEOUT, **kwargs): + resp = await self.request( + ConfigRequest(**kwargs), + timeout=timeout, + ) + if resp.error: + raise RuntimeError( + f"{resp.error.type}: {resp.error.message}" + ) + return resp + + async def get(self, type, key, timeout=CONFIG_TIMEOUT): + """Get a single config value. Returns the value string or None.""" + resp = await self._request( + operation="get", + keys=[ConfigKey(type=type, key=key)], + timeout=timeout, + ) + if resp.values and len(resp.values) > 0: + return resp.values[0].value + return None + + async def put(self, type, key, value, timeout=CONFIG_TIMEOUT): + """Put a single config value.""" + await self._request( + operation="put", + values=[ConfigValue(type=type, key=key, value=value)], + timeout=timeout, + ) + + async def put_many(self, values, timeout=CONFIG_TIMEOUT): + """Put multiple config values in a single request. + values is a list of (type, key, value) tuples.""" + await self._request( + operation="put", + values=[ + ConfigValue(type=t, key=k, value=v) + for t, k, v in values + ], + timeout=timeout, + ) + + async def delete(self, type, key, timeout=CONFIG_TIMEOUT): + """Delete a single config key.""" + await self._request( + operation="delete", + keys=[ConfigKey(type=type, key=key)], + timeout=timeout, + ) + + async def delete_many(self, keys, timeout=CONFIG_TIMEOUT): + """Delete multiple config keys in a single request. + keys is a list of (type, key) tuples.""" + await self._request( + operation="delete", + keys=[ + ConfigKey(type=t, key=k) + for t, k in keys + ], + timeout=timeout, + ) + + async def keys(self, type, timeout=CONFIG_TIMEOUT): + """List all keys for a config type.""" + resp = await self._request( + operation="list", + type=type, + timeout=timeout, + ) + return resp.directory + + +class ConfigClientSpec(RequestResponseSpec): + def __init__( + self, request_name, response_name, + ): + super(ConfigClientSpec, self).__init__( + request_name=request_name, + request_schema=ConfigRequest, + response_name=response_name, + response_schema=ConfigResponse, + impl=ConfigClient, + ) diff --git a/trustgraph-base/trustgraph/base/consumer_spec.py b/trustgraph-base/trustgraph/base/consumer_spec.py index ae218b44..023537df 100644 --- a/trustgraph-base/trustgraph/base/consumer_spec.py +++ b/trustgraph-base/trustgraph/base/consumer_spec.py @@ -23,7 +23,7 @@ class ConsumerSpec(Spec): taskgroup = processor.taskgroup, flow = flow, backend = processor.pubsub, - topic = definition[self.name], + topic = definition["topics"][self.name], subscriber = processor.id + "--" + flow.name + "--" + self.name, schema = self.schema, handler = self.handler, diff --git a/trustgraph-base/trustgraph/base/flow_processor.py b/trustgraph-base/trustgraph/base/flow_processor.py index edf974b3..99cb0f53 100644 --- a/trustgraph-base/trustgraph/base/flow_processor.py +++ b/trustgraph-base/trustgraph/base/flow_processor.py @@ -29,9 +29,9 @@ class FlowProcessor(AsyncProcessor): # Initialise base class super(FlowProcessor, self).__init__(**params) - # Register configuration handler + # Register configuration handler for this processor's config type self.register_config_handler( - self.on_configure_flows, types=["active-flow"] + self.on_configure_flows, types=[f"processor:{self.id}"] ) # Initialise flow information state @@ -66,17 +66,16 @@ class FlowProcessor(AsyncProcessor): logger.info(f"Got config version {version}") - # Skip over invalid data - if "active-flow" not in config: return - - # Check there's configuration information for me - if self.id in config["active-flow"]: - - # Get my flow config - flow_config = json.loads(config["active-flow"][self.id]) + config_type = f"processor:{self.id}" + # Get my flow config — each key is a variant, each value is + # the JSON config for that flow variant + if config_type in config: + flow_config = { + k: json.loads(v) + for k, v in config[config_type].items() + } else: - logger.debug("No configuration settings for me.") flow_config = {} diff --git a/trustgraph-base/trustgraph/base/parameter_spec.py b/trustgraph-base/trustgraph/base/parameter_spec.py index 4c6dfb21..7b3a3b26 100644 --- a/trustgraph-base/trustgraph/base/parameter_spec.py +++ b/trustgraph-base/trustgraph/base/parameter_spec.py @@ -18,6 +18,6 @@ class ParameterSpec(Spec): def add(self, flow: Any, processor: Any, definition: dict[str, Any]) -> None: - value = definition.get(self.name, None) + value = definition.get("parameters", {}).get(self.name, None) flow.parameter[self.name] = Parameter(value) diff --git a/trustgraph-base/trustgraph/base/producer_spec.py b/trustgraph-base/trustgraph/base/producer_spec.py index 7e77ef35..16905f4b 100644 --- a/trustgraph-base/trustgraph/base/producer_spec.py +++ b/trustgraph-base/trustgraph/base/producer_spec.py @@ -19,7 +19,7 @@ class ProducerSpec(Spec): producer = Producer( backend = processor.pubsub, - topic = definition[self.name], + topic = definition["topics"][self.name], schema = self.schema, metrics = producer_metrics, ) diff --git a/trustgraph-base/trustgraph/base/pulsar_backend.py b/trustgraph-base/trustgraph/base/pulsar_backend.py index 6f125399..2100483d 100644 --- a/trustgraph-base/trustgraph/base/pulsar_backend.py +++ b/trustgraph-base/trustgraph/base/pulsar_backend.py @@ -266,6 +266,26 @@ class PulsarBackend: return PulsarBackendConsumer(pulsar_consumer, schema) + async def create_queue(self, topic: str, subscription: str) -> None: + """No-op — Pulsar auto-creates topics on first use. + TODO: Use admin REST API for explicit persistent topic creation.""" + pass + + async def delete_queue(self, topic: str, subscription: str) -> None: + """No-op — to be replaced with admin REST API calls. + TODO: Delete subscription and persistent topic via admin API.""" + pass + + async def queue_exists(self, topic: str, subscription: str) -> bool: + """Returns True — Pulsar auto-creates on subscribe. + TODO: Use admin REST API for actual existence check.""" + return True + + async def ensure_queue(self, topic: str, subscription: str) -> None: + """No-op — Pulsar auto-creates topics on first use. + TODO: Use admin REST API for explicit creation.""" + pass + def close(self) -> None: """Close the Pulsar client.""" self.client.close() diff --git a/trustgraph-base/trustgraph/base/rabbitmq_backend.py b/trustgraph-base/trustgraph/base/rabbitmq_backend.py index 7de51a0a..43c717c3 100644 --- a/trustgraph-base/trustgraph/base/rabbitmq_backend.py +++ b/trustgraph-base/trustgraph/base/rabbitmq_backend.py @@ -19,6 +19,7 @@ Uses basic_consume (push) instead of basic_get (polling) for efficient message delivery. """ +import asyncio import json import time import logging @@ -170,28 +171,37 @@ class RabbitMQBackendConsumer: self._connection = pika.BlockingConnection(self._connection_params) self._channel = self._connection.channel() - # Declare the topic exchange + # Declare the topic exchange (idempotent, also done by producers) self._channel.exchange_declare( exchange=self._exchange_name, exchange_type='topic', durable=True, ) - # Declare the queue — anonymous if exclusive - result = self._channel.queue_declare( - queue=self._queue_name, - durable=self._durable, - exclusive=self._exclusive, - auto_delete=self._auto_delete, - ) - # Capture actual name (important for anonymous queues where name='') - self._queue_name = result.method.queue + if self._exclusive: + # Anonymous ephemeral queue (response/notify class). + # These are per-consumer and must be created here — the + # broker assigns the name. + result = self._channel.queue_declare( + queue='', + durable=False, + exclusive=True, + auto_delete=True, + ) + self._queue_name = result.method.queue - self._channel.queue_bind( - queue=self._queue_name, - exchange=self._exchange_name, - routing_key=self._routing_key, - ) + self._channel.queue_bind( + queue=self._queue_name, + exchange=self._exchange_name, + routing_key=self._routing_key, + ) + else: + # Named queue (flow/request class). Queue must already + # exist — created by the flow service or ensure_queue. + # We just verify it exists and bind to consume. + self._channel.queue_declare( + queue=self._queue_name, passive=True, + ) self._channel.basic_qos(prefetch_count=1) @@ -409,5 +419,124 @@ class RabbitMQBackend: queue_name, schema, queue_durable, exclusive, auto_delete, ) + def _create_queue_sync(self, exchange, routing_key, queue_name, durable): + """Blocking queue creation — run via asyncio.to_thread.""" + connection = None + try: + connection = pika.BlockingConnection(self._connection_params) + channel = connection.channel() + channel.exchange_declare( + exchange=exchange, + exchange_type='topic', + durable=True, + ) + channel.queue_declare( + queue=queue_name, + durable=durable, + exclusive=False, + auto_delete=False, + ) + channel.queue_bind( + queue=queue_name, + exchange=exchange, + routing_key=routing_key, + ) + logger.info(f"Created queue: {queue_name}") + finally: + if connection and connection.is_open: + try: + connection.close() + except Exception: + pass + + async def create_queue(self, topic: str, subscription: str) -> None: + """Pre-create a named queue bound to the topic exchange. + + Only applies to shared queues (flow/request class). Response and + notify queues are anonymous/auto-delete and created by consumers. + """ + exchange, routing_key, cls, durable = self._parse_queue_id(topic) + + if cls in ('response', 'notify'): + return + + queue_name = f"{exchange}.{routing_key}.{subscription}" + await asyncio.to_thread( + self._create_queue_sync, exchange, routing_key, + queue_name, durable, + ) + + def _delete_queue_sync(self, queue_name): + """Blocking queue deletion — run via asyncio.to_thread.""" + connection = None + try: + connection = pika.BlockingConnection(self._connection_params) + channel = connection.channel() + channel.queue_delete(queue=queue_name) + logger.info(f"Deleted queue: {queue_name}") + except Exception as e: + # Idempotent — queue may already be gone + logger.debug(f"Queue delete for {queue_name}: {e}") + finally: + if connection and connection.is_open: + try: + connection.close() + except Exception: + pass + + async def delete_queue(self, topic: str, subscription: str) -> None: + """Delete a named queue and any messages it contains. + + Only applies to shared queues (flow/request class). Response and + notify queues are anonymous/auto-delete and managed by the broker. + """ + exchange, routing_key, cls, durable = self._parse_queue_id(topic) + + if cls in ('response', 'notify'): + return + + queue_name = f"{exchange}.{routing_key}.{subscription}" + await asyncio.to_thread(self._delete_queue_sync, queue_name) + + def _queue_exists_sync(self, queue_name): + """Blocking queue existence check — run via asyncio.to_thread. + Uses passive=True which checks without creating.""" + connection = None + try: + connection = pika.BlockingConnection(self._connection_params) + channel = connection.channel() + channel.queue_declare(queue=queue_name, passive=True) + return True + except pika.exceptions.ChannelClosedByBroker: + # 404 NOT_FOUND — queue does not exist + return False + finally: + if connection and connection.is_open: + try: + connection.close() + except Exception: + pass + + async def queue_exists(self, topic: str, subscription: str) -> bool: + """Check whether a named queue exists. + + Only applies to shared queues (flow/request class). Response and + notify queues are anonymous/ephemeral — always returns False. + """ + exchange, routing_key, cls, durable = self._parse_queue_id(topic) + + if cls in ('response', 'notify'): + return False + + queue_name = f"{exchange}.{routing_key}.{subscription}" + return await asyncio.to_thread( + self._queue_exists_sync, queue_name + ) + + async def ensure_queue(self, topic: str, subscription: str) -> None: + """Ensure a queue exists, creating it if necessary.""" + if not await self.queue_exists(topic, subscription): + await self.create_queue(topic, subscription) + def close(self) -> None: pass diff --git a/trustgraph-base/trustgraph/base/request_response_spec.py b/trustgraph-base/trustgraph/base/request_response_spec.py index d19aae10..b91c655c 100644 --- a/trustgraph-base/trustgraph/base/request_response_spec.py +++ b/trustgraph-base/trustgraph/base/request_response_spec.py @@ -137,10 +137,10 @@ class RequestResponseSpec(Spec): "--" + str(uuid.uuid4()) ), consumer_name = flow.id, - request_topic = definition[self.request_name], + request_topic = definition["topics"][self.request_name], request_schema = self.request_schema, request_metrics = request_metrics, - response_topic = definition[self.response_name], + response_topic = definition["topics"][self.response_name], response_schema = self.response_schema, response_metrics = response_metrics, ) diff --git a/trustgraph-base/trustgraph/base/subscriber_spec.py b/trustgraph-base/trustgraph/base/subscriber_spec.py index 39b852e5..bf35f869 100644 --- a/trustgraph-base/trustgraph/base/subscriber_spec.py +++ b/trustgraph-base/trustgraph/base/subscriber_spec.py @@ -20,7 +20,7 @@ class SubscriberSpec(Spec): subscriber = Subscriber( backend = processor.pubsub, - topic = definition[self.name], + topic = definition["topics"][self.name], subscription = flow.id, consumer_name = flow.id, schema = self.schema, diff --git a/trustgraph-cli/trustgraph/cli/show_flows.py b/trustgraph-cli/trustgraph/cli/show_flows.py index d1abf984..f7a14469 100644 --- a/trustgraph-cli/trustgraph/cli/show_flows.py +++ b/trustgraph-cli/trustgraph/cli/show_flows.py @@ -33,7 +33,7 @@ def describe_interfaces(intdefs, flow): lst.append(f"{k} response: {resp}") if kind == "send": - q = intfs[k] + q = intfs[k]["flow"] lst.append(f"{k}: {q}") diff --git a/trustgraph-flow/pyproject.toml b/trustgraph-flow/pyproject.toml index 14e919c0..492af385 100644 --- a/trustgraph-flow/pyproject.toml +++ b/trustgraph-flow/pyproject.toml @@ -61,6 +61,7 @@ api-gateway = "trustgraph.gateway:run" chunker-recursive = "trustgraph.chunking.recursive:run" chunker-token = "trustgraph.chunking.token:run" config-svc = "trustgraph.config.service:run" +flow-svc = "trustgraph.flow.service:run" doc-embeddings-query-milvus = "trustgraph.query.doc_embeddings.milvus:run" doc-embeddings-query-pinecone = "trustgraph.query.doc_embeddings.pinecone:run" doc-embeddings-query-qdrant = "trustgraph.query.doc_embeddings.qdrant:run" diff --git a/trustgraph-flow/trustgraph/config/service/service.py b/trustgraph-flow/trustgraph/config/service/service.py index 5c235bb2..75232315 100644 --- a/trustgraph-flow/trustgraph/config/service/service.py +++ b/trustgraph-flow/trustgraph/config/service/service.py @@ -11,14 +11,10 @@ from trustgraph.schema import ConfigRequest, ConfigResponse, ConfigPush from trustgraph.schema import config_request_queue, config_response_queue from trustgraph.schema import config_push_queue -from trustgraph.schema import FlowRequest, FlowResponse -from trustgraph.schema import flow_request_queue, flow_response_queue - from trustgraph.base import AsyncProcessor, Consumer, Producer from trustgraph.base.cassandra_config import add_cassandra_args, resolve_cassandra_config from . config import Configuration -from . flow import FlowConfig from ... base import ProcessorMetrics, ConsumerMetrics, ProducerMetrics from ... base import Consumer, Producer @@ -32,9 +28,6 @@ default_config_request_queue = config_request_queue default_config_response_queue = config_response_queue default_config_push_queue = config_push_queue -default_flow_request_queue = flow_request_queue -default_flow_response_queue = flow_response_queue - default_cassandra_host = "cassandra" class Processor(AsyncProcessor): @@ -51,13 +44,6 @@ class Processor(AsyncProcessor): "config_push_queue", default_config_push_queue ) - flow_request_queue = params.get( - "flow_request_queue", default_flow_request_queue - ) - flow_response_queue = params.get( - "flow_response_queue", default_flow_response_queue - ) - cassandra_host = params.get("cassandra_host") cassandra_username = params.get("cassandra_username") cassandra_password = params.get("cassandra_password") @@ -77,16 +63,11 @@ class Processor(AsyncProcessor): id = params.get("id") - flow_request_schema = FlowRequest - flow_response_schema = FlowResponse - super(Processor, self).__init__( **params | { "config_request_schema": ConfigRequest.__name__, "config_response_schema": ConfigResponse.__name__, "config_push_schema": ConfigPush.__name__, - "flow_request_schema": FlowRequest.__name__, - "flow_response_schema": FlowResponse.__name__, "cassandra_host": self.cassandra_host, "cassandra_username": self.cassandra_username, "cassandra_password": self.cassandra_password, @@ -103,12 +84,8 @@ class Processor(AsyncProcessor): processor = self.id, flow = None, name = "config-push" ) - flow_request_metrics = ConsumerMetrics( - processor = self.id, flow = None, name = "flow-request" - ) - flow_response_metrics = ProducerMetrics( - processor = self.id, flow = None, name = "flow-response" - ) + self.config_request_topic = config_request_queue + self.config_request_subscriber = id self.config_request_consumer = Consumer( taskgroup = self.taskgroup, @@ -135,24 +112,6 @@ class Processor(AsyncProcessor): metrics = config_push_metrics, ) - self.flow_request_consumer = Consumer( - taskgroup = self.taskgroup, - backend = self.pubsub, - flow = None, - topic = flow_request_queue, - subscriber = id, - schema = FlowRequest, - handler = self.on_flow_request, - metrics = flow_request_metrics, - ) - - self.flow_response_producer = Producer( - backend = self.pubsub, - topic = flow_response_queue, - schema = FlowResponse, - metrics = flow_response_metrics, - ) - self.config = Configuration( host = self.cassandra_host, username = self.cassandra_username, @@ -161,15 +120,15 @@ class Processor(AsyncProcessor): push = self.push ) - self.flow = FlowConfig(self.config) - logger.info("Config service initialized") async def start(self): + await self.pubsub.ensure_queue( + self.config_request_topic, self.config_request_subscriber + ) await self.push() # Startup poke: empty types = everything await self.config_request_consumer.start() - await self.flow_request_consumer.start() async def push(self, types=None): @@ -193,7 +152,7 @@ class Processor(AsyncProcessor): # Sender-produced ID id = msg.properties()["id"] - logger.info(f"Handling config request {id}...") + logger.debug(f"Handling config request {id}...") resp = await self.config.handle(v) @@ -214,36 +173,6 @@ class Processor(AsyncProcessor): resp, properties={"id": id} ) - async def on_flow_request(self, msg, consumer, flow): - - try: - - v = msg.value() - - # Sender-produced ID - id = msg.properties()["id"] - - logger.info(f"Handling flow request {id}...") - - resp = await self.flow.handle(v) - - await self.flow_response_producer.send( - resp, properties={"id": id} - ) - - except Exception as e: - - resp = FlowResponse( - error=Error( - type = "flow-error", - message = str(e), - ), - ) - - await self.flow_response_producer.send( - resp, properties={"id": id} - ) - @staticmethod def add_args(parser): @@ -263,18 +192,6 @@ class Processor(AsyncProcessor): # Note: --config-push-queue is already added by AsyncProcessor.add_args() - parser.add_argument( - '--flow-request-queue', - default=default_flow_request_queue, - help=f'Flow request queue (default: {default_flow_request_queue})' - ) - - parser.add_argument( - '--flow-response-queue', - default=default_flow_response_queue, - help=f'Flow response queue {default_flow_response_queue}', - ) - add_cassandra_args(parser) def run(): diff --git a/trustgraph-flow/trustgraph/cores/knowledge.py b/trustgraph-flow/trustgraph/cores/knowledge.py index 0d5c3d82..d03d4ed6 100644 --- a/trustgraph-flow/trustgraph/cores/knowledge.py +++ b/trustgraph-flow/trustgraph/cores/knowledge.py @@ -192,8 +192,8 @@ class KnowledgeManager: if "graph-embeddings-store" not in flow["interfaces"]: raise RuntimeError("Flow has no graph-embeddings-store") - t_q = flow["interfaces"]["triples-store"] - ge_q = flow["interfaces"]["graph-embeddings-store"] + t_q = flow["interfaces"]["triples-store"]["flow"] + ge_q = flow["interfaces"]["graph-embeddings-store"]["flow"] # Got this far, it should all work await respond( diff --git a/trustgraph-flow/trustgraph/cores/service.py b/trustgraph-flow/trustgraph/cores/service.py index d6390805..400f96d1 100755 --- a/trustgraph-flow/trustgraph/cores/service.py +++ b/trustgraph-flow/trustgraph/cores/service.py @@ -82,6 +82,9 @@ class Processor(AsyncProcessor): processor = self.id, flow = None, name = "knowledge-response" ) + self.knowledge_request_topic = knowledge_request_queue + self.knowledge_request_subscriber = id + self.knowledge_request_consumer = Consumer( taskgroup = self.taskgroup, backend = self.pubsub, @@ -116,6 +119,9 @@ class Processor(AsyncProcessor): async def start(self): + await self.pubsub.ensure_queue( + self.knowledge_request_topic, self.knowledge_request_subscriber + ) await super(Processor, self).start() await self.knowledge_request_consumer.start() await self.knowledge_response_producer.start() diff --git a/trustgraph-flow/trustgraph/flow/__init__.py b/trustgraph-flow/trustgraph/flow/__init__.py new file mode 100644 index 00000000..214f7272 --- /dev/null +++ b/trustgraph-flow/trustgraph/flow/__init__.py @@ -0,0 +1,2 @@ + +from . service import * diff --git a/trustgraph-flow/trustgraph/flow/service/__init__.py b/trustgraph-flow/trustgraph/flow/service/__init__.py new file mode 100644 index 00000000..214f7272 --- /dev/null +++ b/trustgraph-flow/trustgraph/flow/service/__init__.py @@ -0,0 +1,2 @@ + +from . service import * diff --git a/trustgraph-flow/trustgraph/flow/service/__main__.py b/trustgraph-flow/trustgraph/flow/service/__main__.py new file mode 100644 index 00000000..da5a9021 --- /dev/null +++ b/trustgraph-flow/trustgraph/flow/service/__main__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from . service import run + +if __name__ == '__main__': + run() diff --git a/trustgraph-flow/trustgraph/config/service/flow.py b/trustgraph-flow/trustgraph/flow/service/flow.py similarity index 57% rename from trustgraph-flow/trustgraph/config/service/flow.py rename to trustgraph-flow/trustgraph/flow/service/flow.py index 775c8b4e..477c6a2c 100644 --- a/trustgraph-flow/trustgraph/config/service/flow.py +++ b/trustgraph-flow/trustgraph/flow/service/flow.py @@ -1,15 +1,22 @@ from trustgraph.schema import FlowResponse, Error +import asyncio import json import logging # Module logger logger = logging.getLogger(__name__) +# Queue deletion retry settings +DELETE_RETRIES = 5 +DELETE_RETRY_DELAY = 2 # seconds + + class FlowConfig: - def __init__(self, config): + def __init__(self, config, pubsub): self.config = config + self.pubsub = pubsub # Cache for parameter type definitions to avoid repeated lookups self.param_type_cache = {} @@ -22,9 +29,12 @@ class FlowConfig: user_params: User-provided parameters dict (may be None or empty) Returns: - Complete parameter dict with user values and defaults merged (all values as strings) + Complete parameter dict with user values and defaults merged + (all values as strings) """ + # If the flow blueprint has no parameters section, return user params as-is (stringified) + if "parameters" not in flow_blueprint: if not user_params: return {} @@ -49,7 +59,9 @@ class FlowConfig: if param_type not in self.param_type_cache: try: # Fetch parameter type definition from config store - type_def = await self.config.get("parameter-type").get(param_type) + type_def = await self.config.get( + "parameter-type", param_type + ) if type_def: self.param_type_cache[param_type] = json.loads(type_def) else: @@ -102,32 +114,29 @@ class FlowConfig: async def handle_list_blueprints(self, msg): - names = list(await self.config.get("flow-blueprint").keys()) + names = list(await self.config.keys("flow-blueprint")) return FlowResponse( error = None, blueprint_names = names, ) - + async def handle_get_blueprint(self, msg): return FlowResponse( error = None, blueprint_definition = await self.config.get( - "flow-blueprint" - ).get(msg.blueprint_name), + "flow-blueprint", msg.blueprint_name + ), ) - + async def handle_put_blueprint(self, msg): - await self.config.get("flow-blueprint").put( + await self.config.put( + "flow-blueprint", msg.blueprint_name, msg.blueprint_definition ) - await self.config.inc_version() - - await self.config.push(types=["flow-blueprint"]) - return FlowResponse( error = None, ) @@ -136,28 +145,24 @@ class FlowConfig: logger.debug(f"Flow config message: {msg}") - await self.config.get("flow-blueprint").delete(msg.blueprint_name) - - await self.config.inc_version() - - await self.config.push(types=["flow-blueprint"]) + await self.config.delete("flow-blueprint", msg.blueprint_name) return FlowResponse( error = None, ) - + async def handle_list_flows(self, msg): - names = list(await self.config.get("flow").keys()) + names = list(await self.config.keys("flow")) return FlowResponse( error = None, flow_ids = names, ) - + async def handle_get_flow(self, msg): - flow_data = await self.config.get("flow").get(msg.flow_id) + flow_data = await self.config.get("flow", msg.flow_id) flow = json.loads(flow_data) return FlowResponse( @@ -166,7 +171,7 @@ class FlowConfig: description = flow.get("description", ""), parameters = flow.get("parameters", {}), ) - + async def handle_start_flow(self, msg): if msg.blueprint_name is None: @@ -175,17 +180,17 @@ class FlowConfig: if msg.flow_id is None: raise RuntimeError("No flow ID") - if msg.flow_id in await self.config.get("flow").keys(): + if msg.flow_id in await self.config.keys("flow"): raise RuntimeError("Flow already exists") if msg.description is None: raise RuntimeError("No description") - if msg.blueprint_name not in await self.config.get("flow-blueprint").keys(): + if msg.blueprint_name not in await self.config.keys("flow-blueprint"): raise RuntimeError("Blueprint does not exist") cls = json.loads( - await self.config.get("flow-blueprint").get(msg.blueprint_name) + await self.config.get("flow-blueprint", msg.blueprint_name) ) # Resolve parameters by merging user-provided values with defaults @@ -210,6 +215,15 @@ class FlowConfig: return result + # Pre-create flow-level queues so the data path is wired + # before processors receive their config and start connecting. + queues = self._collect_flow_queues(cls, repl_template_with_params) + for topic, subscription in queues: + await self.pubsub.create_queue(topic, subscription) + + # Build all processor config updates, then write in a single batch. + updates = [] + for kind in ("blueprint", "flow"): for k, v in cls[kind].items(): @@ -218,37 +232,34 @@ class FlowConfig: variant = repl_template_with_params(variant) - v = { + topics = { repl_template_with_params(k2): repl_template_with_params(v2) - for k2, v2 in v.items() + for k2, v2 in v.get("topics", {}).items() } - flac = await self.config.get("active-flow").get(processor) - if flac is not None: - target = json.loads(flac) - else: - target = {} + params = { + repl_template_with_params(k2): repl_template_with_params(v2) + for k2, v2 in v.get("parameters", {}).items() + } - # The condition if variant not in target: means it only adds - # the configuration if the variant doesn't already exist. - # If "everything" already exists in the target with old - # values, they won't update. + entry = { + "topics": topics, + "parameters": params, + } - if variant not in target: - target[variant] = v + updates.append(( + f"processor:{processor}", + variant, + json.dumps(entry), + )) - await self.config.get("active-flow").put( - processor, json.dumps(target) - ) + await self.config.put_many(updates) def repl_interface(i): - if isinstance(i, str): - return repl_template_with_params(i) - else: - return { - k: repl_template_with_params(v) - for k, v in i.items() - } + return { + k: repl_template_with_params(v) + for k, v in i.items() + } if "interfaces" in cls: interfaces = { @@ -258,8 +269,8 @@ class FlowConfig: else: interfaces = {} - await self.config.get("flow").put( - msg.flow_id, + await self.config.put( + "flow", msg.flow_id, json.dumps({ "description": msg.description, "blueprint-name": msg.blueprint_name, @@ -268,23 +279,131 @@ class FlowConfig: }) ) - await self.config.inc_version() - - await self.config.push(types=["active-flow", "flow"]) - return FlowResponse( error = None, ) - + + async def ensure_existing_flow_queues(self): + """Ensure queues exist for all already-running flows. + + Called on startup to handle flows that were started before this + version of the flow service was deployed, or before a restart. + """ + flow_ids = await self.config.keys("flow") + + for flow_id in flow_ids: + try: + flow_data = await self.config.get("flow", flow_id) + if flow_data is None: + continue + + flow = json.loads(flow_data) + + blueprint_name = flow.get("blueprint-name") + if blueprint_name is None: + continue + + # Skip flows that are mid-shutdown + if flow.get("status") == "stopping": + continue + + parameters = flow.get("parameters", {}) + + blueprint_data = await self.config.get( + "flow-blueprint", blueprint_name + ) + if blueprint_data is None: + logger.warning( + f"Blueprint '{blueprint_name}' not found for " + f"flow '{flow_id}', skipping queue creation" + ) + continue + + cls = json.loads(blueprint_data) + + def repl_template(tmp): + result = tmp.replace( + "{blueprint}", blueprint_name + ).replace( + "{id}", flow_id + ) + for param_name, param_value in parameters.items(): + result = result.replace( + f"{{{param_name}}}", str(param_value) + ) + return result + + queues = self._collect_flow_queues(cls, repl_template) + for topic, subscription in queues: + await self.pubsub.ensure_queue(topic, subscription) + + logger.info( + f"Ensured queues for existing flow '{flow_id}'" + ) + + except Exception as e: + logger.error( + f"Failed to ensure queues for flow '{flow_id}': {e}" + ) + + def _collect_flow_queues(self, cls, repl_template): + """Collect (topic, subscription) pairs for all flow-level queues. + + Iterates the blueprint's "flow" section and reads only the + "topics" dict from each processor entry. + """ + queues = [] + + for k, v in cls["flow"].items(): + processor, variant = k.split(":", 1) + variant = repl_template(variant) + + for spec_name, topic_template in v.get("topics", {}).items(): + topic = repl_template(topic_template) + subscription = f"{processor}--{variant}--{spec_name}" + queues.append((topic, subscription)) + + return queues + + async def _delete_queues(self, queues): + """Delete queues with retries. Best-effort — logs failures but + does not raise.""" + for attempt in range(DELETE_RETRIES): + remaining = [] + + for topic, subscription in queues: + try: + await self.pubsub.delete_queue(topic, subscription) + except Exception as e: + logger.warning( + f"Queue delete failed (attempt {attempt + 1}/" + f"{DELETE_RETRIES}): {topic}: {e}" + ) + remaining.append((topic, subscription)) + + if not remaining: + return + + queues = remaining + + if attempt < DELETE_RETRIES - 1: + await asyncio.sleep(DELETE_RETRY_DELAY) + + for topic, subscription in queues: + logger.error( + f"Failed to delete queue after {DELETE_RETRIES} " + f"attempts: {topic}" + ) + async def handle_stop_flow(self, msg): if msg.flow_id is None: raise RuntimeError("No flow ID") - if msg.flow_id not in await self.config.get("flow").keys(): + if msg.flow_id not in await self.config.keys("flow"): raise RuntimeError("Flow ID invalid") - flow = json.loads(await self.config.get("flow").get(msg.flow_id)) + flow = json.loads(await self.config.get("flow", msg.flow_id)) if "blueprint-name" not in flow: raise RuntimeError("Internal error: flow has no flow blueprint") @@ -292,7 +411,9 @@ class FlowConfig: blueprint_name = flow["blueprint-name"] parameters = flow.get("parameters", {}) - cls = json.loads(await self.config.get("flow-blueprint").get(blueprint_name)) + cls = json.loads( + await self.config.get("flow-blueprint", blueprint_name) + ) def repl_template(tmp): result = tmp.replace( @@ -305,34 +426,33 @@ class FlowConfig: result = result.replace(f"{{{param_name}}}", str(param_value)) return result - for kind in ("flow",): + # Collect queue identifiers before removing config + queues = self._collect_flow_queues(cls, repl_template) - for k, v in cls[kind].items(): + # Phase 1: Set status to "stopping" and remove processor config. + # The config push tells processors to shut down their consumers. + flow["status"] = "stopping" + await self.config.put( + "flow", msg.flow_id, json.dumps(flow) + ) - processor, variant = k.split(":", 1) + # Delete all processor config entries for this flow. + deletes = [] - variant = repl_template(variant) + for k, v in cls["flow"].items(): - flac = await self.config.get("active-flow").get(processor) + processor, variant = k.split(":", 1) + variant = repl_template(variant) - if flac is not None: - target = json.loads(flac) - else: - target = {} + deletes.append((f"processor:{processor}", variant)) - if variant in target: - del target[variant] + await self.config.delete_many(deletes) - await self.config.get("active-flow").put( - processor, json.dumps(target) - ) + # Phase 2: Delete queues with retries, then remove the flow record. + await self._delete_queues(queues) - if msg.flow_id in await self.config.get("flow").keys(): - await self.config.get("flow").delete(msg.flow_id) - - await self.config.inc_version() - - await self.config.push(types=["active-flow", "flow"]) + if msg.flow_id in await self.config.keys("flow"): + await self.config.delete("flow", msg.flow_id) return FlowResponse( error = None, @@ -368,4 +488,3 @@ class FlowConfig: ) return resp - diff --git a/trustgraph-flow/trustgraph/flow/service/service.py b/trustgraph-flow/trustgraph/flow/service/service.py new file mode 100644 index 00000000..a3f2fb6b --- /dev/null +++ b/trustgraph-flow/trustgraph/flow/service/service.py @@ -0,0 +1,162 @@ + +""" +Flow service. Manages flow lifecycle — starting and stopping flows +by coordinating with the config service via pub/sub. +""" + +import logging + +from trustgraph.schema import Error + +from trustgraph.schema import FlowRequest, FlowResponse +from trustgraph.schema import flow_request_queue, flow_response_queue +from trustgraph.schema import ConfigRequest, ConfigResponse +from trustgraph.schema import config_request_queue, config_response_queue + +from trustgraph.base import AsyncProcessor, Consumer, Producer +from trustgraph.base import ConsumerMetrics, ProducerMetrics, SubscriberMetrics +from trustgraph.base import ConfigClient + +from . flow import FlowConfig + +# Module logger +logger = logging.getLogger(__name__) + +default_ident = "flow-svc" + +default_flow_request_queue = flow_request_queue +default_flow_response_queue = flow_response_queue + + +class Processor(AsyncProcessor): + + def __init__(self, **params): + + flow_request_queue = params.get( + "flow_request_queue", default_flow_request_queue + ) + flow_response_queue = params.get( + "flow_response_queue", default_flow_response_queue + ) + + id = params.get("id") + + super(Processor, self).__init__( + **params | { + "flow_request_schema": FlowRequest.__name__, + "flow_response_schema": FlowResponse.__name__, + } + ) + + flow_request_metrics = ConsumerMetrics( + processor = self.id, flow = None, name = "flow-request" + ) + flow_response_metrics = ProducerMetrics( + processor = self.id, flow = None, name = "flow-response" + ) + + self.flow_request_topic = flow_request_queue + self.flow_request_subscriber = id + + self.flow_request_consumer = Consumer( + taskgroup = self.taskgroup, + backend = self.pubsub, + flow = None, + topic = flow_request_queue, + subscriber = id, + schema = FlowRequest, + handler = self.on_flow_request, + metrics = flow_request_metrics, + ) + + self.flow_response_producer = Producer( + backend = self.pubsub, + topic = flow_response_queue, + schema = FlowResponse, + metrics = flow_response_metrics, + ) + + config_req_metrics = ProducerMetrics( + processor=self.id, flow=None, name="config-request", + ) + config_resp_metrics = SubscriberMetrics( + processor=self.id, flow=None, name="config-response", + ) + + self.config_client = ConfigClient( + backend=self.pubsub, + subscription=f"{self.id}--config--{id}", + consumer_name=self.id, + request_topic=config_request_queue, + request_schema=ConfigRequest, + request_metrics=config_req_metrics, + response_topic=config_response_queue, + response_schema=ConfigResponse, + response_metrics=config_resp_metrics, + ) + + self.flow = FlowConfig(self.config_client, self.pubsub) + + logger.info("Flow service initialized") + + async def start(self): + + await self.pubsub.ensure_queue( + self.flow_request_topic, self.flow_request_subscriber + ) + await self.config_client.start() + await self.flow.ensure_existing_flow_queues() + await self.flow_request_consumer.start() + + async def on_flow_request(self, msg, consumer, flow): + + try: + + v = msg.value() + + # Sender-produced ID + id = msg.properties()["id"] + + logger.debug(f"Handling flow request {id}...") + + resp = await self.flow.handle(v) + + await self.flow_response_producer.send( + resp, properties={"id": id} + ) + + except Exception as e: + + logger.error(f"Flow request failed: {e}") + + resp = FlowResponse( + error=Error( + type = "flow-error", + message = str(e), + ), + ) + + await self.flow_response_producer.send( + resp, properties={"id": id} + ) + + @staticmethod + def add_args(parser): + + AsyncProcessor.add_args(parser) + + parser.add_argument( + '--flow-request-queue', + default=default_flow_request_queue, + help=f'Flow request queue (default: {default_flow_request_queue})' + ) + + parser.add_argument( + '--flow-response-queue', + default=default_flow_response_queue, + help=f'Flow response queue {default_flow_response_queue}', + ) + +def run(): + + Processor.launch(default_ident, __doc__) diff --git a/trustgraph-flow/trustgraph/gateway/config/receiver.py b/trustgraph-flow/trustgraph/gateway/config/receiver.py index 2323cd61..c721a46a 100755 --- a/trustgraph-flow/trustgraph/gateway/config/receiver.py +++ b/trustgraph-flow/trustgraph/gateway/config/receiver.py @@ -54,7 +54,7 @@ class ConfigReceiver: return # Gateway cares about flow config - if notify_types and "flow" not in notify_types and "active-flow" not in notify_types: + if notify_types and "flow" not in notify_types: logger.debug( f"Ignoring config notify v{notify_version}, " f"no flow types in {notify_types}" diff --git a/trustgraph-flow/trustgraph/gateway/dispatch/manager.py b/trustgraph-flow/trustgraph/gateway/dispatch/manager.py index ef3d5507..592120b1 100644 --- a/trustgraph-flow/trustgraph/gateway/dispatch/manager.py +++ b/trustgraph-flow/trustgraph/gateway/dispatch/manager.py @@ -226,7 +226,7 @@ class DispatcherManager: raise RuntimeError("This kind not supported by flow") # FIXME: The -store bit, does it make sense? - qconfig = intf_defs[int_kind] + qconfig = intf_defs[int_kind]["flow"] id = str(uuid.uuid4()) dispatcher = import_dispatchers[kind]( @@ -264,7 +264,7 @@ class DispatcherManager: if int_kind not in intf_defs: raise RuntimeError("This kind not supported by flow") - qconfig = intf_defs[int_kind] + qconfig = intf_defs[int_kind]["flow"] id = str(uuid.uuid4()) dispatcher = export_dispatchers[kind]( @@ -320,7 +320,7 @@ class DispatcherManager: elif kind in sender_dispatchers: dispatcher = sender_dispatchers[kind]( backend = self.backend, - queue = qconfig, + queue = qconfig["flow"], ) else: raise RuntimeError("Invalid kind") diff --git a/trustgraph-flow/trustgraph/librarian/service.py b/trustgraph-flow/trustgraph/librarian/service.py index c735a550..83f97bf3 100755 --- a/trustgraph-flow/trustgraph/librarian/service.py +++ b/trustgraph-flow/trustgraph/librarian/service.py @@ -162,6 +162,9 @@ class Processor(AsyncProcessor): processor = self.id, flow = None, name = "storage-response" ) + self.librarian_request_topic = librarian_request_queue + self.librarian_request_subscriber = id + self.librarian_request_consumer = Consumer( taskgroup = self.taskgroup, backend = self.pubsub, @@ -180,6 +183,9 @@ class Processor(AsyncProcessor): metrics = librarian_response_metrics, ) + self.collection_request_topic = collection_request_queue + self.collection_request_subscriber = id + self.collection_request_consumer = Consumer( taskgroup = self.taskgroup, backend = self.pubsub, @@ -248,7 +254,7 @@ class Processor(AsyncProcessor): self.register_config_handler( self.on_librarian_config, - types=["flow", "active-flow"], + types=["flow"], ) self.flows = {} @@ -257,6 +263,12 @@ class Processor(AsyncProcessor): async def start(self): + await self.pubsub.ensure_queue( + self.librarian_request_topic, self.librarian_request_subscriber + ) + await self.pubsub.ensure_queue( + self.collection_request_topic, self.collection_request_subscriber + ) await super(Processor, self).start() await self.librarian_request_consumer.start() await self.librarian_response_producer.start() @@ -365,12 +377,12 @@ class Processor(AsyncProcessor): else: kind = "document-load" - q = flow["interfaces"][kind] + q = flow["interfaces"][kind]["flow"] # Emit document provenance to knowledge graph if "triples-store" in flow["interfaces"]: await self.emit_document_provenance( - document, processing, flow["interfaces"]["triples-store"] + document, processing, flow["interfaces"]["triples-store"]["flow"] ) if kind == "text-load": From 391b9076f33aa35619d6b40d72edcb54d21b896b Mon Sep 17 00:00:00 2001 From: Het Patel <102606191+CuriousHet@users.noreply.github.com> Date: Fri, 17 Apr 2026 15:59:19 +0530 Subject: [PATCH 2/7] feat: add domain and range validation to triple extraction in extract.py (#825) --- .../test_prompt_and_extraction.py | 46 +++++++++++++++ .../trustgraph/extract/kg/ontology/extract.py | 57 ++++++++++++++++++- 2 files changed, 100 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py b/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py index 9f9c8551..bae6bdbd 100644 --- a/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py +++ b/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py @@ -231,6 +231,52 @@ class TestTripleValidation: is_valid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset) assert is_valid == expected, f"Validation of {predicate} should be {expected}" + def test_validates_domain_correctly_with_entity_types(self, extractor, sample_ontology_subset): + """Test domain validation correctly compares against extracted entity_types.""" + subject = "my-recipe" + predicate = "produces" + object_val = "my-food" + + # Proper domain for produces is Recipe + entity_types = { + "my-recipe": "Recipe", + "my-food": "Food" + } + + is_valid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset, entity_types) + assert is_valid, "Valid domain should be accepted" + + # Invalid domain + entity_types_invalid = { + "my-recipe": "Ingredient", + "my-food": "Food" + } + is_invalid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset, entity_types_invalid) + assert not is_invalid, "Invalid domain should be rejected" + + def test_validates_range_correctly_with_entity_types(self, extractor, sample_ontology_subset): + """Test range validation correctly compares against extracted entity_types.""" + subject = "my-recipe" + predicate = "produces" + object_val = "my-food" + + # Proper range for produces is Food + entity_types = { + "my-recipe": "Recipe", + "my-food": "Food" + } + + is_valid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset, entity_types) + assert is_valid, "Valid range should be accepted" + + # Invalid range + entity_types_invalid = { + "my-recipe": "Recipe", + "my-food": "Recipe" + } + is_invalid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset, entity_types_invalid) + assert not is_invalid, "Invalid range should be rejected" + class TestTripleParsing: """Test suite for parsing triples from LLM responses.""" diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py index bdb0e6e8..e024ad40 100644 --- a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py @@ -429,6 +429,16 @@ class Processor(FlowProcessor): validated_triples = [] ontology_id = ontology_subset.ontology_id + # Gather entity types for domain/range validation + entity_types = {} + for triple_data in triples_response: + if isinstance(triple_data, dict): + s = triple_data.get('subject', '') + p = triple_data.get('predicate', '') + o = triple_data.get('object', '') + if s and p and o and (p == "rdf:type" or p == str(RDF_TYPE)): + entity_types[s] = o + for triple_data in triples_response: try: if isinstance(triple_data, dict): @@ -440,7 +450,7 @@ class Processor(FlowProcessor): continue # Validate against ontology - if self.is_valid_triple(subject, predicate, object_val, ontology_subset): + if self.is_valid_triple(subject, predicate, object_val, ontology_subset, entity_types): # Expand URIs before creating Value objects subject_uri = self.expand_uri(subject, ontology_subset, ontology_id) predicate_uri = self.expand_uri(predicate, ontology_subset, ontology_id) @@ -493,8 +503,11 @@ class Processor(FlowProcessor): return False def is_valid_triple(self, subject: str, predicate: str, object_val: str, - ontology_subset: OntologySubset) -> bool: + ontology_subset: OntologySubset, entity_types: dict = None) -> bool: """Validate triple against ontology constraints.""" + if entity_types is None: + entity_types = {} + # Special case for rdf:type if predicate == "rdf:type" or predicate == str(RDF_TYPE): # Check if object is a valid class @@ -511,7 +524,45 @@ class Processor(FlowProcessor): if not is_obj_prop and not is_dt_prop: return False # Unknown property - # TODO: Add more sophisticated validation (domain/range checking) + prop_def = ontology_subset.object_properties[predicate] if is_obj_prop else ontology_subset.datatype_properties[predicate] + if not isinstance(prop_def, dict): + prop_def = prop_def.__dict__ if hasattr(prop_def, '__dict__') else {} + + # Domain validation + expected_domain = prop_def.get('domain') + if expected_domain and subject in entity_types: + actual_domain = entity_types[subject] + if actual_domain != expected_domain: + is_subclass = False + curr_class = actual_domain + while curr_class in ontology_subset.classes: + cls_def = ontology_subset.classes[curr_class] + parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None + if parent == expected_domain: + is_subclass = True + break + curr_class = parent + if not is_subclass: + return False + + # Range validation + if is_obj_prop: + expected_range = prop_def.get('range') + if expected_range and object_val in entity_types: + actual_range = entity_types[object_val] + if actual_range != expected_range: + is_subclass = False + curr_class = actual_range + while curr_class in ontology_subset.classes: + cls_def = ontology_subset.classes[curr_class] + parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None + if parent == expected_range: + is_subclass = True + break + curr_class = parent + if not is_subclass: + return False + return True def expand_uri(self, value: str, ontology_subset: OntologySubset, ontology_id: str = "unknown") -> str: From 3505bfdd2521eeccc606e50ff00fa5ed60f9b286 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Fri, 17 Apr 2026 18:01:35 +0100 Subject: [PATCH 3/7] refactor: use one fanout exchange per topic instead of shared topic exchange (#827) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RabbitMQ backend used a single topic exchange per topicspace with routing keys to differentiate logical topics. This meant the flow service had to manually create named queues for every processor-topic pair, including producer-side topics — creating phantom queues that accumulated unread message copies indefinitely. Replace with one fanout exchange per logical topic. Consumers now declare and bind their own queues on connect. The flow service manages topic lifecycle (create/delete exchanges) rather than queue lifecycle, and only collects unique topic identifiers instead of per-processor (topic, subscription) pairs. Backend API: create_queue/delete_queue/ensure_queue replaced with create_topic/delete_topic/ensure_topic (subscription parameter removed). --- .../unit/test_pubsub/test_rabbitmq_backend.py | 47 ++-- trustgraph-base/trustgraph/base/backend.py | 46 ++-- .../trustgraph/base/pulsar_backend.py | 10 +- .../trustgraph/base/rabbitmq_backend.py | 221 ++++++++---------- .../trustgraph/config/service/service.py | 4 +- trustgraph-flow/trustgraph/cores/service.py | 4 +- .../trustgraph/flow/service/flow.py | 72 +++--- .../trustgraph/flow/service/service.py | 6 +- .../trustgraph/librarian/service.py | 8 +- 9 files changed, 190 insertions(+), 228 deletions(-) diff --git a/tests/unit/test_pubsub/test_rabbitmq_backend.py b/tests/unit/test_pubsub/test_rabbitmq_backend.py index ffe18fd7..54599723 100644 --- a/tests/unit/test_pubsub/test_rabbitmq_backend.py +++ b/tests/unit/test_pubsub/test_rabbitmq_backend.py @@ -1,5 +1,5 @@ """ -Unit tests for RabbitMQ backend — queue name mapping and factory dispatch. +Unit tests for RabbitMQ backend — topic parsing and factory dispatch. Does not require a running RabbitMQ instance. """ @@ -12,7 +12,7 @@ from trustgraph.base.rabbitmq_backend import RabbitMQBackend from trustgraph.base.pubsub import get_pubsub, add_pubsub_args -class TestRabbitMQMapQueueName: +class TestRabbitMQParseTopic: @pytest.fixture def backend(self): @@ -20,43 +20,48 @@ class TestRabbitMQMapQueueName: return b def test_flow_is_durable(self, backend): - name, durable = backend.map_queue_name('flow:tg:text-completion-request') + exchange, cls, durable = backend._parse_topic('flow:tg:text-completion-request') assert durable is True - assert name == 'tg.flow.text-completion-request' + assert cls == 'flow' + assert exchange == 'tg.flow.text-completion-request' def test_notify_is_not_durable(self, backend): - name, durable = backend.map_queue_name('notify:tg:config') + exchange, cls, durable = backend._parse_topic('notify:tg:config') assert durable is False - assert name == 'tg.notify.config' + assert cls == 'notify' + assert exchange == 'tg.notify.config' def test_request_is_not_durable(self, backend): - name, durable = backend.map_queue_name('request:tg:config') + exchange, cls, durable = backend._parse_topic('request:tg:config') assert durable is False - assert name == 'tg.request.config' + assert cls == 'request' + assert exchange == 'tg.request.config' def test_response_is_not_durable(self, backend): - name, durable = backend.map_queue_name('response:tg:librarian') + exchange, cls, durable = backend._parse_topic('response:tg:librarian') assert durable is False - assert name == 'tg.response.librarian' + assert cls == 'response' + assert exchange == 'tg.response.librarian' def test_custom_topicspace(self, backend): - name, durable = backend.map_queue_name('flow:prod:my-queue') - assert name == 'prod.flow.my-queue' + exchange, cls, durable = backend._parse_topic('flow:prod:my-queue') + assert exchange == 'prod.flow.my-queue' assert durable is True def test_no_colon_defaults_to_flow(self, backend): - name, durable = backend.map_queue_name('simple-queue') - assert name == 'tg.simple-queue' - assert durable is False + exchange, cls, durable = backend._parse_topic('simple-queue') + assert exchange == 'tg.flow.simple-queue' + assert cls == 'flow' + assert durable is True def test_invalid_class_raises(self, backend): - with pytest.raises(ValueError, match="Invalid queue class"): - backend.map_queue_name('unknown:tg:topic') + with pytest.raises(ValueError, match="Invalid topic class"): + backend._parse_topic('unknown:tg:topic') - def test_flow_with_flow_suffix(self, backend): - """Queue names with flow suffix (e.g. :default) are preserved.""" - name, durable = backend.map_queue_name('request:tg:prompt:default') - assert name == 'tg.request.prompt:default' + def test_topic_with_flow_suffix(self, backend): + """Topic names with flow suffix (e.g. :default) are preserved.""" + exchange, cls, durable = backend._parse_topic('request:tg:prompt:default') + assert exchange == 'tg.request.prompt:default' class TestGetPubsubRabbitMQ: diff --git a/trustgraph-base/trustgraph/base/backend.py b/trustgraph-base/trustgraph/base/backend.py index 0f95ca1b..a105ca17 100644 --- a/trustgraph-base/trustgraph/base/backend.py +++ b/trustgraph-base/trustgraph/base/backend.py @@ -121,7 +121,7 @@ class PubSubBackend(Protocol): Create a producer for a topic. Args: - topic: Generic topic format (qos/tenant/namespace/queue) + topic: Queue identifier in class:topicspace:topic format schema: Dataclass type for messages **options: Backend-specific options (e.g., chunking_enabled) @@ -159,59 +159,55 @@ class PubSubBackend(Protocol): """ ... - async def create_queue(self, topic: str, subscription: str) -> None: + async def create_topic(self, topic: str) -> None: """ - Pre-create a queue so it exists before any consumer connects. + Create the broker-side resources for a logical topic. - The topic and subscription together identify the queue, mirroring - create_consumer where the queue name is derived from both. + For RabbitMQ this creates a fanout exchange. For Pulsar this is + a no-op (topics auto-create on first use). - Idempotent — creating an already-existing queue succeeds silently. + Idempotent — creating an already-existing topic succeeds silently. Args: - topic: Queue identifier in class:topicspace:topic format - subscription: Subscription/consumer group name + topic: Topic identifier in class:topicspace:topic format """ ... - async def delete_queue(self, topic: str, subscription: str) -> None: + async def delete_topic(self, topic: str) -> None: """ - Delete a queue and any messages it contains. + Delete a topic and discard any in-flight messages. - The topic and subscription together identify the queue, mirroring - create_consumer where the queue name is derived from both. + For RabbitMQ this deletes the fanout exchange; consumer queues + lose their binding and drain naturally. - Idempotent — deleting a non-existent queue succeeds silently. + Idempotent — deleting a non-existent topic succeeds silently. Args: - topic: Queue identifier in class:topicspace:topic format - subscription: Subscription/consumer group name + topic: Topic identifier in class:topicspace:topic format """ ... - async def queue_exists(self, topic: str, subscription: str) -> bool: + async def topic_exists(self, topic: str) -> bool: """ - Check whether a queue exists. + Check whether a topic exists. Args: - topic: Queue identifier in class:topicspace:topic format - subscription: Subscription/consumer group name + topic: Topic identifier in class:topicspace:topic format Returns: - True if the queue exists, False otherwise. + True if the topic exists, False otherwise. """ ... - async def ensure_queue(self, topic: str, subscription: str) -> None: + async def ensure_topic(self, topic: str) -> None: """ - Ensure a queue exists, creating it if necessary. + Ensure a topic exists, creating it if necessary. Convenience wrapper — checks existence, creates if missing. - Used by system services on startup. + Used by the flow service and system services on startup. Args: - topic: Queue identifier in class:topicspace:topic format - subscription: Subscription/consumer group name + topic: Topic identifier in class:topicspace:topic format """ ... diff --git a/trustgraph-base/trustgraph/base/pulsar_backend.py b/trustgraph-base/trustgraph/base/pulsar_backend.py index 2100483d..e27d16af 100644 --- a/trustgraph-base/trustgraph/base/pulsar_backend.py +++ b/trustgraph-base/trustgraph/base/pulsar_backend.py @@ -266,22 +266,22 @@ class PulsarBackend: return PulsarBackendConsumer(pulsar_consumer, schema) - async def create_queue(self, topic: str, subscription: str) -> None: + async def create_topic(self, topic: str) -> None: """No-op — Pulsar auto-creates topics on first use. TODO: Use admin REST API for explicit persistent topic creation.""" pass - async def delete_queue(self, topic: str, subscription: str) -> None: + async def delete_topic(self, topic: str) -> None: """No-op — to be replaced with admin REST API calls. - TODO: Delete subscription and persistent topic via admin API.""" + TODO: Delete persistent topic via admin API.""" pass - async def queue_exists(self, topic: str, subscription: str) -> bool: + async def topic_exists(self, topic: str) -> bool: """Returns True — Pulsar auto-creates on subscribe. TODO: Use admin REST API for actual existence check.""" return True - async def ensure_queue(self, topic: str, subscription: str) -> None: + async def ensure_topic(self, topic: str) -> None: """No-op — Pulsar auto-creates topics on first use. TODO: Use admin REST API for explicit creation.""" pass diff --git a/trustgraph-base/trustgraph/base/rabbitmq_backend.py b/trustgraph-base/trustgraph/base/rabbitmq_backend.py index 43c717c3..73b80cb9 100644 --- a/trustgraph-base/trustgraph/base/rabbitmq_backend.py +++ b/trustgraph-base/trustgraph/base/rabbitmq_backend.py @@ -1,22 +1,24 @@ """ RabbitMQ backend implementation for pub/sub abstraction. -Uses a single topic exchange per topicspace. The logical queue name -becomes the routing key. Consumer behavior is determined by the -subscription name: +Each logical topic maps to its own fanout exchange. The exchange name +encodes the full topic identity: -- Same subscription + same topic = shared queue (competing consumers) -- Different subscriptions = separate queues (broadcast / fan-out) + class:topicspace:topic → exchange topicspace.class.topic -This mirrors Pulsar's subscription model using idiomatic RabbitMQ. +Producers publish to the exchange with an empty routing key. +Consumers declare and bind their own queues: + + - flow / request: named durable/non-durable queue (competing consumers) + - response / notify: anonymous exclusive auto-delete queue (per-subscriber) + +The flow service manages topic lifecycle (create/delete exchanges). +Consumers manage their own queue lifecycle (declare + bind on connect). Architecture: - Producer --> [tg exchange] --routing key--> [named queue] --> Consumer - --routing key--> [named queue] --> Consumer - --routing key--> [exclusive q] --> Subscriber - -Uses basic_consume (push) instead of basic_get (polling) for -efficient message delivery. + Producer --> [fanout exchange] --> [named queue] --> Consumer + --> [named queue] --> Consumer + --> [exclusive queue] --> Subscriber """ import asyncio @@ -58,18 +60,16 @@ class RabbitMQMessage: class RabbitMQBackendProducer: - """Publishes messages to a topic exchange with a routing key. + """Publishes messages to a fanout exchange. Uses thread-local connections so each thread gets its own connection/channel. This avoids wire corruption from concurrent threads writing to the same socket (pika is not thread-safe). """ - def __init__(self, connection_params, exchange_name, routing_key, - durable): + def __init__(self, connection_params, exchange_name, durable): self._connection_params = connection_params self._exchange_name = exchange_name - self._routing_key = routing_key self._durable = durable self._local = threading.local() @@ -90,7 +90,7 @@ class RabbitMQBackendProducer: chan = conn.channel() chan.exchange_declare( exchange=self._exchange_name, - exchange_type='topic', + exchange_type='fanout', durable=True, ) self._local.connection = conn @@ -113,7 +113,7 @@ class RabbitMQBackendProducer: channel = self._get_channel() channel.basic_publish( exchange=self._exchange_name, - routing_key=self._routing_key, + routing_key='', body=json_data.encode('utf-8'), properties=amqp_properties, ) @@ -144,19 +144,17 @@ class RabbitMQBackendProducer: class RabbitMQBackendConsumer: - """Consumes from a queue bound to a topic exchange. + """Consumes from a queue bound to a fanout exchange. Uses basic_consume (push model) with messages delivered to an internal thread-safe queue. process_data_events() drives both message delivery and heartbeat processing. """ - def __init__(self, connection_params, exchange_name, routing_key, - queue_name, schema_cls, durable, exclusive=False, - auto_delete=False): + def __init__(self, connection_params, exchange_name, queue_name, + schema_cls, durable, exclusive=False, auto_delete=False): self._connection_params = connection_params self._exchange_name = exchange_name - self._routing_key = routing_key self._queue_name = queue_name self._schema_cls = schema_cls self._durable = durable @@ -171,17 +169,16 @@ class RabbitMQBackendConsumer: self._connection = pika.BlockingConnection(self._connection_params) self._channel = self._connection.channel() - # Declare the topic exchange (idempotent, also done by producers) + # Declare the fanout exchange (idempotent) self._channel.exchange_declare( exchange=self._exchange_name, - exchange_type='topic', + exchange_type='fanout', durable=True, ) if self._exclusive: # Anonymous ephemeral queue (response/notify class). - # These are per-consumer and must be created here — the - # broker assigns the name. + # Per-consumer, broker assigns the name. result = self._channel.queue_declare( queue='', durable=False, @@ -189,20 +186,22 @@ class RabbitMQBackendConsumer: auto_delete=True, ) self._queue_name = result.method.queue - - self._channel.queue_bind( - queue=self._queue_name, - exchange=self._exchange_name, - routing_key=self._routing_key, - ) else: - # Named queue (flow/request class). Queue must already - # exist — created by the flow service or ensure_queue. - # We just verify it exists and bind to consume. + # Named queue (flow/request class). + # Consumer owns its queue — declare and bind here. self._channel.queue_declare( - queue=self._queue_name, passive=True, + queue=self._queue_name, + durable=self._durable, + exclusive=False, + auto_delete=False, ) + # Bind queue to the fanout exchange + self._channel.queue_bind( + queue=self._queue_name, + exchange=self._exchange_name, + ) + self._channel.basic_qos(prefetch_count=1) # Register push-based consumer @@ -318,7 +317,7 @@ class RabbitMQBackendConsumer: class RabbitMQBackend: - """RabbitMQ pub/sub backend using a topic exchange per topicspace.""" + """RabbitMQ pub/sub backend using one fanout exchange per topic.""" def __init__(self, host='localhost', port=5672, username='guest', password='guest', vhost='/'): @@ -331,20 +330,23 @@ class RabbitMQBackend: ) logger.info(f"RabbitMQ backend: {host}:{port} vhost={vhost}") - def _parse_queue_id(self, queue_id: str) -> tuple[str, str, str, bool]: + def _parse_topic(self, topic_id: str) -> tuple[str, str, bool]: """ - Parse queue identifier into exchange, routing key, and durability. + Parse topic identifier into exchange name and durability. Format: class:topicspace:topic - Returns: (exchange_name, routing_key, class, durable) - """ - if ':' not in queue_id: - return 'tg', queue_id, 'flow', False + Returns: (exchange_name, class, durable) - parts = queue_id.split(':', 2) + The exchange name encodes the full topic identity: + class:topicspace:topic → topicspace.class.topic + """ + if ':' not in topic_id: + return f'tg.flow.{topic_id}', 'flow', True + + parts = topic_id.split(':', 2) if len(parts) != 3: raise ValueError( - f"Invalid queue format: {queue_id}, " + f"Invalid topic format: {topic_id}, " f"expected class:topicspace:topic" ) @@ -356,36 +358,28 @@ class RabbitMQBackend: durable = False else: raise ValueError( - f"Invalid queue class: {cls}, " + f"Invalid topic class: {cls}, " f"expected flow, request, response, or notify" ) - # Exchange per topicspace, routing key includes class - exchange_name = topicspace - routing_key = f"{cls}.{topic}" + exchange_name = f"{topicspace}.{cls}.{topic}" - return exchange_name, routing_key, cls, durable - - # Keep map_queue_name for backward compatibility with tests - def map_queue_name(self, queue_id: str) -> tuple[str, bool]: - exchange, routing_key, cls, durable = self._parse_queue_id(queue_id) - return f"{exchange}.{routing_key}", durable + return exchange_name, cls, durable def create_producer(self, topic: str, schema: type, **options) -> BackendProducer: - exchange, routing_key, cls, durable = self._parse_queue_id(topic) + exchange, cls, durable = self._parse_topic(topic) logger.debug( - f"Creating producer: exchange={exchange}, " - f"routing_key={routing_key}" + f"Creating producer: exchange={exchange}" ) return RabbitMQBackendProducer( - self._connection_params, exchange, routing_key, durable, + self._connection_params, exchange, durable, ) def create_consumer(self, topic: str, subscription: str, schema: type, initial_position: str = 'latest', **options) -> BackendConsumer: - """Create a consumer with a queue bound to the topic exchange. + """Create a consumer with a queue bound to the topic's exchange. Behaviour is determined by the topic's class prefix: - flow: named durable queue, competing consumers (round-robin) @@ -393,7 +387,7 @@ class RabbitMQBackend: - response: anonymous ephemeral queue, per-subscriber (auto-delete) - notify: anonymous ephemeral queue, per-subscriber (auto-delete) """ - exchange, routing_key, cls, durable = self._parse_queue_id(topic) + exchange, cls, durable = self._parse_topic(topic) if cls in ('response', 'notify'): # Per-subscriber: anonymous queue, auto-deleted on disconnect @@ -403,45 +397,33 @@ class RabbitMQBackend: auto_delete = True else: # Shared: named queue, competing consumers - queue_name = f"{exchange}.{routing_key}.{subscription}" + queue_name = f"{exchange}.{subscription}" queue_durable = durable exclusive = False auto_delete = False logger.debug( f"Creating consumer: exchange={exchange}, " - f"routing_key={routing_key}, queue={queue_name or '(anonymous)'}, " - f"cls={cls}" + f"queue={queue_name or '(anonymous)'}, cls={cls}" ) return RabbitMQBackendConsumer( - self._connection_params, exchange, routing_key, + self._connection_params, exchange, queue_name, schema, queue_durable, exclusive, auto_delete, ) - def _create_queue_sync(self, exchange, routing_key, queue_name, durable): - """Blocking queue creation — run via asyncio.to_thread.""" + def _create_topic_sync(self, exchange_name): + """Blocking exchange creation — run via asyncio.to_thread.""" connection = None try: connection = pika.BlockingConnection(self._connection_params) channel = connection.channel() channel.exchange_declare( - exchange=exchange, - exchange_type='topic', + exchange=exchange_name, + exchange_type='fanout', durable=True, ) - channel.queue_declare( - queue=queue_name, - durable=durable, - exclusive=False, - auto_delete=False, - ) - channel.queue_bind( - queue=queue_name, - exchange=exchange, - routing_key=routing_key, - ) - logger.info(f"Created queue: {queue_name}") + logger.info(f"Created topic (exchange): {exchange_name}") finally: if connection and connection.is_open: try: @@ -449,34 +431,30 @@ class RabbitMQBackend: except Exception: pass - async def create_queue(self, topic: str, subscription: str) -> None: - """Pre-create a named queue bound to the topic exchange. + async def create_topic(self, topic: str) -> None: + """Create the fanout exchange for a logical topic. - Only applies to shared queues (flow/request class). Response and - notify queues are anonymous/auto-delete and created by consumers. + Only applies to flow and request class topics. Response and + notify exchanges are created on demand by consumers. """ - exchange, routing_key, cls, durable = self._parse_queue_id(topic) + exchange, cls, durable = self._parse_topic(topic) if cls in ('response', 'notify'): return - queue_name = f"{exchange}.{routing_key}.{subscription}" - await asyncio.to_thread( - self._create_queue_sync, exchange, routing_key, - queue_name, durable, - ) + await asyncio.to_thread(self._create_topic_sync, exchange) - def _delete_queue_sync(self, queue_name): - """Blocking queue deletion — run via asyncio.to_thread.""" + def _delete_topic_sync(self, exchange_name): + """Blocking exchange deletion — run via asyncio.to_thread.""" connection = None try: connection = pika.BlockingConnection(self._connection_params) channel = connection.channel() - channel.queue_delete(queue=queue_name) - logger.info(f"Deleted queue: {queue_name}") + channel.exchange_delete(exchange=exchange_name) + logger.info(f"Deleted topic (exchange): {exchange_name}") except Exception as e: - # Idempotent — queue may already be gone - logger.debug(f"Queue delete for {queue_name}: {e}") + # Idempotent — exchange may already be gone + logger.debug(f"Exchange delete for {exchange_name}: {e}") finally: if connection and connection.is_open: try: @@ -484,31 +462,27 @@ class RabbitMQBackend: except Exception: pass - async def delete_queue(self, topic: str, subscription: str) -> None: - """Delete a named queue and any messages it contains. + async def delete_topic(self, topic: str) -> None: + """Delete a topic's fanout exchange. - Only applies to shared queues (flow/request class). Response and - notify queues are anonymous/auto-delete and managed by the broker. + Consumer queues lose their binding and drain naturally. """ - exchange, routing_key, cls, durable = self._parse_queue_id(topic) + exchange, cls, durable = self._parse_topic(topic) + await asyncio.to_thread(self._delete_topic_sync, exchange) - if cls in ('response', 'notify'): - return - - queue_name = f"{exchange}.{routing_key}.{subscription}" - await asyncio.to_thread(self._delete_queue_sync, queue_name) - - def _queue_exists_sync(self, queue_name): - """Blocking queue existence check — run via asyncio.to_thread. + def _topic_exists_sync(self, exchange_name): + """Blocking exchange existence check — run via asyncio.to_thread. Uses passive=True which checks without creating.""" connection = None try: connection = pika.BlockingConnection(self._connection_params) channel = connection.channel() - channel.queue_declare(queue=queue_name, passive=True) + channel.exchange_declare( + exchange=exchange_name, passive=True, + ) return True except pika.exceptions.ChannelClosedByBroker: - # 404 NOT_FOUND — queue does not exist + # 404 NOT_FOUND — exchange does not exist return False finally: if connection and connection.is_open: @@ -517,26 +491,25 @@ class RabbitMQBackend: except Exception: pass - async def queue_exists(self, topic: str, subscription: str) -> bool: - """Check whether a named queue exists. + async def topic_exists(self, topic: str) -> bool: + """Check whether a topic's exchange exists. - Only applies to shared queues (flow/request class). Response and - notify queues are anonymous/ephemeral — always returns False. + Only applies to flow and request class topics. Response and + notify topics are ephemeral — always returns False. """ - exchange, routing_key, cls, durable = self._parse_queue_id(topic) + exchange, cls, durable = self._parse_topic(topic) if cls in ('response', 'notify'): return False - queue_name = f"{exchange}.{routing_key}.{subscription}" return await asyncio.to_thread( - self._queue_exists_sync, queue_name + self._topic_exists_sync, exchange ) - async def ensure_queue(self, topic: str, subscription: str) -> None: - """Ensure a queue exists, creating it if necessary.""" - if not await self.queue_exists(topic, subscription): - await self.create_queue(topic, subscription) + async def ensure_topic(self, topic: str) -> None: + """Ensure a topic exists, creating it if necessary.""" + if not await self.topic_exists(topic): + await self.create_topic(topic) def close(self) -> None: pass diff --git a/trustgraph-flow/trustgraph/config/service/service.py b/trustgraph-flow/trustgraph/config/service/service.py index 75232315..fe44b852 100644 --- a/trustgraph-flow/trustgraph/config/service/service.py +++ b/trustgraph-flow/trustgraph/config/service/service.py @@ -124,9 +124,7 @@ class Processor(AsyncProcessor): async def start(self): - await self.pubsub.ensure_queue( - self.config_request_topic, self.config_request_subscriber - ) + await self.pubsub.ensure_topic(self.config_request_topic) await self.push() # Startup poke: empty types = everything await self.config_request_consumer.start() diff --git a/trustgraph-flow/trustgraph/cores/service.py b/trustgraph-flow/trustgraph/cores/service.py index 400f96d1..93017c30 100755 --- a/trustgraph-flow/trustgraph/cores/service.py +++ b/trustgraph-flow/trustgraph/cores/service.py @@ -119,9 +119,7 @@ class Processor(AsyncProcessor): async def start(self): - await self.pubsub.ensure_queue( - self.knowledge_request_topic, self.knowledge_request_subscriber - ) + await self.pubsub.ensure_topic(self.knowledge_request_topic) await super(Processor, self).start() await self.knowledge_request_consumer.start() await self.knowledge_response_producer.start() diff --git a/trustgraph-flow/trustgraph/flow/service/flow.py b/trustgraph-flow/trustgraph/flow/service/flow.py index 477c6a2c..b864faf9 100644 --- a/trustgraph-flow/trustgraph/flow/service/flow.py +++ b/trustgraph-flow/trustgraph/flow/service/flow.py @@ -7,7 +7,7 @@ import logging # Module logger logger = logging.getLogger(__name__) -# Queue deletion retry settings +# Topic deletion retry settings DELETE_RETRIES = 5 DELETE_RETRY_DELAY = 2 # seconds @@ -215,11 +215,11 @@ class FlowConfig: return result - # Pre-create flow-level queues so the data path is wired + # Pre-create topic exchanges so the data path is wired # before processors receive their config and start connecting. - queues = self._collect_flow_queues(cls, repl_template_with_params) - for topic, subscription in queues: - await self.pubsub.create_queue(topic, subscription) + topics = self._collect_flow_topics(cls, repl_template_with_params) + for topic in topics: + await self.pubsub.create_topic(topic) # Build all processor config updates, then write in a single batch. updates = [] @@ -283,8 +283,8 @@ class FlowConfig: error = None, ) - async def ensure_existing_flow_queues(self): - """Ensure queues exist for all already-running flows. + async def ensure_existing_flow_topics(self): + """Ensure topics exist for all already-running flows. Called on startup to handle flows that were started before this version of the flow service was deployed, or before a restart. @@ -315,7 +315,7 @@ class FlowConfig: if blueprint_data is None: logger.warning( f"Blueprint '{blueprint_name}' not found for " - f"flow '{flow_id}', skipping queue creation" + f"flow '{flow_id}', skipping topic creation" ) continue @@ -333,65 +333,63 @@ class FlowConfig: ) return result - queues = self._collect_flow_queues(cls, repl_template) - for topic, subscription in queues: - await self.pubsub.ensure_queue(topic, subscription) + topics = self._collect_flow_topics(cls, repl_template) + for topic in topics: + await self.pubsub.ensure_topic(topic) logger.info( - f"Ensured queues for existing flow '{flow_id}'" + f"Ensured topics for existing flow '{flow_id}'" ) except Exception as e: logger.error( - f"Failed to ensure queues for flow '{flow_id}': {e}" + f"Failed to ensure topics for flow '{flow_id}': {e}" ) - def _collect_flow_queues(self, cls, repl_template): - """Collect (topic, subscription) pairs for all flow-level queues. + def _collect_flow_topics(self, cls, repl_template): + """Collect unique topic identifiers from the blueprint. - Iterates the blueprint's "flow" section and reads only the - "topics" dict from each processor entry. + Iterates the blueprint's "flow" section and returns a + deduplicated set of resolved topic strings. The flow service + manages topic lifecycle (create/delete exchanges), not + individual consumer queues. """ - queues = [] + topics = set() for k, v in cls["flow"].items(): - processor, variant = k.split(":", 1) - variant = repl_template(variant) - for spec_name, topic_template in v.get("topics", {}).items(): topic = repl_template(topic_template) - subscription = f"{processor}--{variant}--{spec_name}" - queues.append((topic, subscription)) + topics.add(topic) - return queues + return topics - async def _delete_queues(self, queues): - """Delete queues with retries. Best-effort — logs failures but + async def _delete_topics(self, topics): + """Delete topics with retries. Best-effort — logs failures but does not raise.""" for attempt in range(DELETE_RETRIES): remaining = [] - for topic, subscription in queues: + for topic in topics: try: - await self.pubsub.delete_queue(topic, subscription) + await self.pubsub.delete_topic(topic) except Exception as e: logger.warning( - f"Queue delete failed (attempt {attempt + 1}/" + f"Topic delete failed (attempt {attempt + 1}/" f"{DELETE_RETRIES}): {topic}: {e}" ) - remaining.append((topic, subscription)) + remaining.append(topic) if not remaining: return - queues = remaining + topics = remaining if attempt < DELETE_RETRIES - 1: await asyncio.sleep(DELETE_RETRY_DELAY) - for topic, subscription in queues: + for topic in topics: logger.error( - f"Failed to delete queue after {DELETE_RETRIES} " + f"Failed to delete topic after {DELETE_RETRIES} " f"attempts: {topic}" ) @@ -426,8 +424,8 @@ class FlowConfig: result = result.replace(f"{{{param_name}}}", str(param_value)) return result - # Collect queue identifiers before removing config - queues = self._collect_flow_queues(cls, repl_template) + # Collect topic identifiers before removing config + topics = self._collect_flow_topics(cls, repl_template) # Phase 1: Set status to "stopping" and remove processor config. # The config push tells processors to shut down their consumers. @@ -448,8 +446,8 @@ class FlowConfig: await self.config.delete_many(deletes) - # Phase 2: Delete queues with retries, then remove the flow record. - await self._delete_queues(queues) + # Phase 2: Delete topics with retries, then remove the flow record. + await self._delete_topics(topics) if msg.flow_id in await self.config.keys("flow"): await self.config.delete("flow", msg.flow_id) diff --git a/trustgraph-flow/trustgraph/flow/service/service.py b/trustgraph-flow/trustgraph/flow/service/service.py index a3f2fb6b..e1997452 100644 --- a/trustgraph-flow/trustgraph/flow/service/service.py +++ b/trustgraph-flow/trustgraph/flow/service/service.py @@ -101,11 +101,9 @@ class Processor(AsyncProcessor): async def start(self): - await self.pubsub.ensure_queue( - self.flow_request_topic, self.flow_request_subscriber - ) + await self.pubsub.ensure_topic(self.flow_request_topic) await self.config_client.start() - await self.flow.ensure_existing_flow_queues() + await self.flow.ensure_existing_flow_topics() await self.flow_request_consumer.start() async def on_flow_request(self, msg, consumer, flow): diff --git a/trustgraph-flow/trustgraph/librarian/service.py b/trustgraph-flow/trustgraph/librarian/service.py index 83f97bf3..ed005298 100755 --- a/trustgraph-flow/trustgraph/librarian/service.py +++ b/trustgraph-flow/trustgraph/librarian/service.py @@ -263,12 +263,8 @@ class Processor(AsyncProcessor): async def start(self): - await self.pubsub.ensure_queue( - self.librarian_request_topic, self.librarian_request_subscriber - ) - await self.pubsub.ensure_queue( - self.collection_request_topic, self.collection_request_subscriber - ) + await self.pubsub.ensure_topic(self.librarian_request_topic) + await self.pubsub.ensure_topic(self.collection_request_topic) await super(Processor, self).start() await self.librarian_request_consumer.start() await self.librarian_response_producer.start() From 81cde7baf9670135f220c5fe31807722aa70b9ec Mon Sep 17 00:00:00 2001 From: Syed Ishmum Ahnaf Date: Sat, 18 Apr 2026 16:14:52 +0600 Subject: [PATCH 4/7] fix for issue #821: deferring optional SDK imports to runtime for provider modules (#828) --- .../trustgraph/embeddings/hf/hf.py | 3 +- .../trustgraph/chunking/recursive/chunker.py | 8 ++-- .../trustgraph/chunking/token/chunker.py | 8 ++-- .../trustgraph/decoding/pdf/pdf_decoder.py | 4 +- .../text_completion/googleaistudio/llm.py | 47 +++++++++++-------- 5 files changed, 39 insertions(+), 31 deletions(-) diff --git a/trustgraph-embeddings-hf/trustgraph/embeddings/hf/hf.py b/trustgraph-embeddings-hf/trustgraph/embeddings/hf/hf.py index 8c4d571b..435d3ba9 100755 --- a/trustgraph-embeddings-hf/trustgraph/embeddings/hf/hf.py +++ b/trustgraph-embeddings-hf/trustgraph/embeddings/hf/hf.py @@ -7,8 +7,6 @@ Input is text, output is embeddings vector. import logging from ... base import EmbeddingsService -from langchain_huggingface import HuggingFaceEmbeddings - # Module logger logger = logging.getLogger(__name__) @@ -38,6 +36,7 @@ class Processor(EmbeddingsService): def _load_model(self, model_name): """Load a model, caching it for reuse""" if self.cached_model_name != model_name: + from langchain_huggingface import HuggingFaceEmbeddings logger.info(f"Loading HuggingFace embeddings model: {model_name}") self.embeddings = HuggingFaceEmbeddings(model_name=model_name) self.cached_model_name = model_name diff --git a/trustgraph-flow/trustgraph/chunking/recursive/chunker.py b/trustgraph-flow/trustgraph/chunking/recursive/chunker.py index df2c58bd..257edc8c 100755 --- a/trustgraph-flow/trustgraph/chunking/recursive/chunker.py +++ b/trustgraph-flow/trustgraph/chunking/recursive/chunker.py @@ -5,7 +5,6 @@ as text as separate output objects. """ import logging -from langchain_text_splitters import RecursiveCharacterTextSplitter from prometheus_client import Histogram from ... schema import TextDocument, Chunk, Metadata, Triples @@ -42,6 +41,9 @@ class Processor(ChunkingService): self.default_chunk_size = chunk_size self.default_chunk_overlap = chunk_overlap + from langchain_text_splitters import RecursiveCharacterTextSplitter + self.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter + if not hasattr(__class__, "chunk_metric"): __class__.chunk_metric = Histogram( 'chunk_size', 'Chunk size', @@ -50,7 +52,7 @@ class Processor(ChunkingService): 2500, 4000, 6400, 10000, 16000] ) - self.text_splitter = RecursiveCharacterTextSplitter( + self.text_splitter = self.RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, @@ -103,7 +105,7 @@ class Processor(ChunkingService): chunk_overlap = int(chunk_overlap) # Create text splitter with effective parameters - text_splitter = RecursiveCharacterTextSplitter( + text_splitter = self.RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, diff --git a/trustgraph-flow/trustgraph/chunking/token/chunker.py b/trustgraph-flow/trustgraph/chunking/token/chunker.py index 3e1161bf..d315c428 100755 --- a/trustgraph-flow/trustgraph/chunking/token/chunker.py +++ b/trustgraph-flow/trustgraph/chunking/token/chunker.py @@ -5,7 +5,6 @@ as text as separate output objects. """ import logging -from langchain_text_splitters import TokenTextSplitter from prometheus_client import Histogram from ... schema import TextDocument, Chunk, Metadata, Triples @@ -42,6 +41,9 @@ class Processor(ChunkingService): self.default_chunk_size = chunk_size self.default_chunk_overlap = chunk_overlap + from langchain_text_splitters import TokenTextSplitter + self.TokenTextSplitter = TokenTextSplitter + if not hasattr(__class__, "chunk_metric"): __class__.chunk_metric = Histogram( 'chunk_size', 'Chunk size', @@ -50,7 +52,7 @@ class Processor(ChunkingService): 2500, 4000, 6400, 10000, 16000] ) - self.text_splitter = TokenTextSplitter( + self.text_splitter = self.TokenTextSplitter( encoding_name="cl100k_base", chunk_size=chunk_size, chunk_overlap=chunk_overlap, @@ -102,7 +104,7 @@ class Processor(ChunkingService): chunk_overlap = int(chunk_overlap) # Create text splitter with effective parameters - text_splitter = TokenTextSplitter( + text_splitter = self.TokenTextSplitter( encoding_name="cl100k_base", chunk_size=chunk_size, chunk_overlap=chunk_overlap, diff --git a/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py b/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py index d0061afd..ab31b717 100755 --- a/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py +++ b/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py @@ -11,13 +11,10 @@ import os import tempfile import base64 import logging -from langchain_community.document_loaders import PyPDFLoader - from ... schema import Document, TextDocument, Metadata from ... schema import librarian_request_queue, librarian_response_queue from ... schema import Triples from ... base import FlowProcessor, ConsumerSpec, ProducerSpec, LibrarianClient - from ... provenance import ( document_uri, page_uri as make_page_uri, derived_entity_triples, set_graph, GRAPH_SOURCE, @@ -131,6 +128,7 @@ class Processor(FlowProcessor): fp.write(base64.b64decode(v.data)) fp.close() + from langchain_community.document_loaders import PyPDFLoader loader = PyPDFLoader(temp_path) pages = loader.load() diff --git a/trustgraph-vertexai/trustgraph/model/text_completion/googleaistudio/llm.py b/trustgraph-vertexai/trustgraph/model/text_completion/googleaistudio/llm.py index d241d6f2..9f431d20 100644 --- a/trustgraph-vertexai/trustgraph/model/text_completion/googleaistudio/llm.py +++ b/trustgraph-vertexai/trustgraph/model/text_completion/googleaistudio/llm.py @@ -12,11 +12,6 @@ Input is prompt, output is response. # TrustGraph implements in the trustgraph-vertexai package. # -from google import genai -from google.genai import types -from google.genai.types import HarmCategory, HarmBlockThreshold -from google.genai.errors import ClientError -from google.api_core.exceptions import ResourceExhausted import os import logging @@ -42,6 +37,18 @@ class Processor(LlmService): temperature = params.get("temperature", default_temperature) max_output = params.get("max_output", default_max_output) + from google import genai + from google.genai import types + from google.genai.types import HarmCategory, HarmBlockThreshold + from google.genai.errors import ClientError + from google.api_core.exceptions import ResourceExhausted + self.genai = genai + self.types = types + self.HarmCategory = HarmCategory + self.HarmBlockThreshold = HarmBlockThreshold + self.ClientError = ClientError + self.ResourceExhausted = ResourceExhausted + if api_key is None: raise RuntimeError("Google AI Studio API key not specified") @@ -53,7 +60,7 @@ class Processor(LlmService): } ) - self.client = genai.Client(api_key=api_key, vertexai=False) + self.client = self.genai.Client(api_key=api_key, vertexai=False) self.default_model = model self.temperature = temperature self.max_output = max_output @@ -61,23 +68,23 @@ class Processor(LlmService): # Cache for generation configs per model self.generation_configs = {} - block_level = HarmBlockThreshold.BLOCK_ONLY_HIGH + block_level = self.HarmBlockThreshold.BLOCK_ONLY_HIGH self.safety_settings = [ - types.SafetySetting( - category = HarmCategory.HARM_CATEGORY_HATE_SPEECH, + self.types.SafetySetting( + category = self.HarmCategory.HARM_CATEGORY_HATE_SPEECH, threshold = block_level, ), - types.SafetySetting( - category = HarmCategory.HARM_CATEGORY_HARASSMENT, + self.types.SafetySetting( + category = self.HarmCategory.HARM_CATEGORY_HARASSMENT, threshold = block_level, ), - types.SafetySetting( - category = HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, + self.types.SafetySetting( + category = self.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, threshold = block_level, ), - types.SafetySetting( - category = HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, + self.types.SafetySetting( + category = self.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, threshold = block_level, ), # There is a documentation conflict on whether or not @@ -97,7 +104,7 @@ class Processor(LlmService): if cache_key not in self.generation_configs: logger.info(f"Creating generation config for '{model_name}' with temperature {effective_temperature}") - self.generation_configs[cache_key] = types.GenerateContentConfig( + self.generation_configs[cache_key] = self.types.GenerateContentConfig( temperature = effective_temperature, top_p = 1, top_k = 40, @@ -146,14 +153,14 @@ class Processor(LlmService): return resp - except ResourceExhausted as e: + except self.ResourceExhausted as e: logger.warning("Rate limit exceeded") # Leave rate limit retries to the default handler raise TooManyRequests() - except ClientError as e: + except self.ClientError as e: # google-genai SDK throws ClientError for 4xx errors if e.code == 429: logger.warning(f"Rate limit exceeded (ClientError 429): {e}") @@ -222,11 +229,11 @@ class Processor(LlmService): logger.debug("Streaming complete") - except ResourceExhausted: + except self.ResourceExhausted: logger.warning("Rate limit exceeded during streaming") raise TooManyRequests() - except ClientError as e: + except self.ClientError as e: # google-genai SDK throws ClientError for 4xx errors if e.code == 429: logger.warning(f"Rate limit exceeded during streaming (ClientError 429): {e}") From d7745baab45f93a93bad09505506c30941b6b0db Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Sat, 18 Apr 2026 11:18:34 +0100 Subject: [PATCH 5/7] Add Kafka pub/sub backend (#830) Third backend alongside Pulsar and RabbitMQ. Topics map 1:1 to Kafka topics, subscriptions map to consumer groups. Response/notify uses unique consumer groups with correlation ID filtering. Topic lifecycle managed via AdminClient with class-based retention. Initial code drop: Needs major integration testing --- docs/tech-specs/kafka-backend.md | 200 +++++++++ tests/unit/test_pubsub/test_kafka_backend.py | 131 ++++++ .../unit/test_pubsub/test_rabbitmq_backend.py | 2 - trustgraph-base/pyproject.toml | 1 + .../trustgraph/base/kafka_backend.py | 400 ++++++++++++++++++ trustgraph-base/trustgraph/base/pubsub.py | 57 +++ 6 files changed, 789 insertions(+), 2 deletions(-) create mode 100644 docs/tech-specs/kafka-backend.md create mode 100644 tests/unit/test_pubsub/test_kafka_backend.py create mode 100644 trustgraph-base/trustgraph/base/kafka_backend.py diff --git a/docs/tech-specs/kafka-backend.md b/docs/tech-specs/kafka-backend.md new file mode 100644 index 00000000..178ddfe4 --- /dev/null +++ b/docs/tech-specs/kafka-backend.md @@ -0,0 +1,200 @@ +--- +layout: default +title: "Kafka Pub/Sub Backend Technical Specification" +parent: "Tech Specs" +--- + +# Kafka Pub/Sub Backend Technical Specification + +## Overview + +Add Apache Kafka as a third pub/sub backend alongside Pulsar and RabbitMQ. +Kafka's topic model maps naturally to TrustGraph's pub/sub abstraction: +topics are first-class, consumer groups provide competing-consumer +semantics, and the AdminClient handles topic lifecycle. + +## Problem + +TrustGraph currently supports Pulsar and RabbitMQ. Kafka is widely +deployed and operationally familiar to many teams. Its log-based +architecture provides durable, replayable message streams with +well-understood scaling properties. + +## Design + +### Concept Mapping + +| TrustGraph concept | Kafka equivalent | +|---|---| +| Topic (`class:topicspace:topic`) | Kafka topic (named `topicspace.class.topic`) | +| Subscription (competing consumers) | Consumer group | +| `create_topic` / `delete_topic` | `AdminClient.create_topics()` / `delete_topics()` | +| `ensure_topic` | `AdminClient.create_topics()` (idempotent) | +| Producer | `KafkaProducer` | +| Consumer | `KafkaConsumer` in a consumer group | +| Message acknowledge | Commit offset | +| Message negative acknowledge | Seek back to message offset | + +### Topic Naming + +The topic name follows the same convention as the RabbitMQ exchange +name: + +``` +class:topicspace:topic -> topicspace.class.topic +``` + +Examples: +- `flow:tg:text-completion-request` -> `tg.flow.text-completion-request` +- `request:tg:librarian` -> `tg.request.librarian` +- `response:tg:config` -> `tg.response.config` + +### Topic Classes and Retention + +Kafka topics are always durable (log-based). The class prefix determines +retention policy rather than durability: + +| Class | Retention | Partitions | Notes | +|---|---|---|---| +| `flow` | Long or infinite | 1 | Data pipeline, order preserved | +| `request` | Short (e.g. 300s) | 1 | RPC requests, ephemeral | +| `response` | Short (e.g. 300s) | 1 | RPC responses, shared (see below) | +| `notify` | Short (e.g. 300s) | 1 | Broadcast signals | + +Single partition per topic preserves message ordering and makes +offset-based acknowledgment equivalent to per-message ack. This matches +the current `prefetch_count=1` model used across all backends. + +### Producers + +Straightforward `KafkaProducer` wrapping. Messages are serialised as +JSON (consistent with the RabbitMQ backend). Message properties/headers +map to Kafka record headers. + +### Consumers + +#### Flow and Request Class (Competing Consumers) + +Consumer group ID = subscription name. Multiple consumers in the same +group share the workload (Kafka's native consumer group rebalancing). + +``` +group_id = subscription # e.g. "triples-store--default--input" +``` + +#### Response and Notify Class (Per-Subscriber) + +This is where Kafka differs from RabbitMQ. Kafka has no anonymous +exclusive auto-delete queues. + +Design: use a **shared response topic with unique consumer groups**. +Each subscriber gets its own consumer group (using the existing +UUID-based subscription name from `RequestResponseSpec`). Every +subscriber reads all messages from the topic and filters by correlation +ID, discarding non-matching messages. + +This is slightly wasteful — N subscribers each read every response — but +request/response traffic is low-volume compared to the data pipeline. +The alternative (per-instance temporary topics) would require dynamic +topic creation/deletion for every API gateway request, which is +expensive in Kafka (AdminClient operations involve controller +coordination). + +### Acknowledgment + +#### Acknowledge (Success) + +Commit the message's offset. With a single partition and sequential +processing, this is equivalent to per-message ack: + +``` +consumer.commit(offsets={partition: offset + 1}) +``` + +#### Negative Acknowledge (Failure / Retry) + +Kafka has no native nack-with-redelivery. On processing failure, seek +the consumer back to the failed message's offset: + +``` +consumer.seek(partition, offset) +``` + +The message is redelivered on the next poll. This matches the current +RabbitMQ `basic_nack(requeue=True)` behaviour: the message is retried +by the same consumer. + +### Topic Lifecycle + +The flow service creates and deletes topics via the Kafka AdminClient: + +- **Flow start**: `AdminClient.create_topics()` for each unique topic + in the blueprint. Topic config includes `retention.ms` based on class. +- **Flow stop**: `AdminClient.delete_topics()` for the flow's topics. +- **Service startup**: `ensure_topic` creates the topic if it doesn't + exist (idempotent via `create_topics` with `validate_only=False`). + +Unlike RabbitMQ where consumers declare their own queues, Kafka topics +must exist before consumers connect. The flow service and service +startup `ensure_topic` calls handle this. + +### Message Encoding + +JSON body, consistent with the RabbitMQ backend. Serialisation uses the +existing `dataclass_to_dict` / `dict_to_dataclass` helpers. Message +properties map to Kafka record headers (byte-encoded string values). + +### Configuration + +New CLI arguments following the existing pattern: + +``` +--pubsub-backend kafka +--kafka-bootstrap-servers localhost:9092 +--kafka-security-protocol PLAINTEXT +--kafka-sasl-mechanism (optional) +--kafka-sasl-username (optional) +--kafka-sasl-password (optional) +``` + +The factory in `pubsub.py` creates a `KafkaBackend` instance when +`pubsub_backend='kafka'`. + +### Dependencies + +`kafka-python-ng` or `confluent-kafka`. The `confluent-kafka` package +provides both producer/consumer and AdminClient in one library with +better performance (C-backed librdkafka), but requires a C extension +build. `kafka-python-ng` is pure Python, simpler to install. + +## Key Design Decisions + +1. **Shared response topic with filtering** over per-instance temporary + topics. Avoids expensive dynamic topic creation for every RPC + exchange. Acceptable because response traffic is low-volume. + +2. **Seek-back for negative acknowledge** over not-committing or retry + topics. Provides immediate redelivery consistent with the RabbitMQ + nack behaviour. + +3. **Single partition per topic** to preserve ordering and simplify + offset management. Parallelism comes from multiple topics and + multiple services, not from partitioning within a topic. + +4. **Retention-based class semantics** instead of durability flags. + Kafka topics are always durable; short retention achieves the + ephemeral behaviour needed for request/response/notify classes. + +## Open Questions + +- **Retention values**: exact `retention.ms` for short-lived topic + classes. 300s (5 minutes) is a starting point; may need tuning based + on worst-case restart/reconnect times. + +- **Library choice**: `confluent-kafka` vs `kafka-python-ng`. Performance + vs install simplicity trade-off. Could support both behind a thin + wrapper. + +- **Consumer poll timeout**: needs to align with the existing + `receive(timeout_millis)` API. Kafka's `poll()` takes a timeout + directly, so this maps cleanly. diff --git a/tests/unit/test_pubsub/test_kafka_backend.py b/tests/unit/test_pubsub/test_kafka_backend.py new file mode 100644 index 00000000..456386f0 --- /dev/null +++ b/tests/unit/test_pubsub/test_kafka_backend.py @@ -0,0 +1,131 @@ +""" +Unit tests for Kafka backend — topic parsing and factory dispatch. +Does not require a running Kafka instance. +""" + +import pytest +import argparse + +from trustgraph.base.kafka_backend import KafkaBackend +from trustgraph.base.pubsub import get_pubsub, add_pubsub_args + + +class TestKafkaParseTopic: + + @pytest.fixture + def backend(self): + b = object.__new__(KafkaBackend) + return b + + def test_flow_is_durable(self, backend): + name, cls, durable = backend._parse_topic('flow:tg:text-completion-request') + assert durable is True + assert cls == 'flow' + assert name == 'tg.flow.text-completion-request' + + def test_notify_is_not_durable(self, backend): + name, cls, durable = backend._parse_topic('notify:tg:config') + assert durable is False + assert cls == 'notify' + assert name == 'tg.notify.config' + + def test_request_is_not_durable(self, backend): + name, cls, durable = backend._parse_topic('request:tg:config') + assert durable is False + assert cls == 'request' + assert name == 'tg.request.config' + + def test_response_is_not_durable(self, backend): + name, cls, durable = backend._parse_topic('response:tg:librarian') + assert durable is False + assert cls == 'response' + assert name == 'tg.response.librarian' + + def test_custom_topicspace(self, backend): + name, cls, durable = backend._parse_topic('flow:prod:my-queue') + assert name == 'prod.flow.my-queue' + assert durable is True + + def test_no_colon_defaults_to_flow(self, backend): + name, cls, durable = backend._parse_topic('simple-queue') + assert name == 'tg.flow.simple-queue' + assert cls == 'flow' + assert durable is True + + def test_invalid_class_raises(self, backend): + with pytest.raises(ValueError, match="Invalid topic class"): + backend._parse_topic('unknown:tg:topic') + + def test_topic_with_flow_suffix(self, backend): + """Topic names with flow suffix (e.g. :default) are preserved.""" + name, cls, durable = backend._parse_topic('request:tg:prompt:default') + assert name == 'tg.request.prompt:default' + + +class TestKafkaRetention: + + @pytest.fixture + def backend(self): + b = object.__new__(KafkaBackend) + return b + + def test_flow_gets_long_retention(self, backend): + assert backend._retention_ms('flow') == 7 * 24 * 60 * 60 * 1000 + + def test_request_gets_short_retention(self, backend): + assert backend._retention_ms('request') == 300 * 1000 + + def test_response_gets_short_retention(self, backend): + assert backend._retention_ms('response') == 300 * 1000 + + def test_notify_gets_short_retention(self, backend): + assert backend._retention_ms('notify') == 300 * 1000 + + +class TestGetPubsubKafka: + + def test_factory_creates_kafka_backend(self): + backend = get_pubsub(pubsub_backend='kafka') + assert isinstance(backend, KafkaBackend) + + def test_factory_passes_config(self): + backend = get_pubsub( + pubsub_backend='kafka', + kafka_bootstrap_servers='myhost:9093', + kafka_security_protocol='SASL_SSL', + kafka_sasl_mechanism='PLAIN', + kafka_sasl_username='user', + kafka_sasl_password='pass', + ) + assert isinstance(backend, KafkaBackend) + assert backend._bootstrap_servers == 'myhost:9093' + assert backend._admin_config['security.protocol'] == 'SASL_SSL' + assert backend._admin_config['sasl.mechanism'] == 'PLAIN' + assert backend._admin_config['sasl.username'] == 'user' + assert backend._admin_config['sasl.password'] == 'pass' + + +class TestAddPubsubArgsKafka: + + def test_kafka_args_present(self): + parser = argparse.ArgumentParser() + add_pubsub_args(parser) + args = parser.parse_args([ + '--pubsub-backend', 'kafka', + '--kafka-bootstrap-servers', 'myhost:9093', + ]) + assert args.pubsub_backend == 'kafka' + assert args.kafka_bootstrap_servers == 'myhost:9093' + + def test_kafka_defaults_container(self): + parser = argparse.ArgumentParser() + add_pubsub_args(parser) + args = parser.parse_args([]) + assert args.kafka_bootstrap_servers == 'kafka:9092' + assert args.kafka_security_protocol == 'PLAINTEXT' + + def test_kafka_standalone_defaults_to_localhost(self): + parser = argparse.ArgumentParser() + add_pubsub_args(parser, standalone=True) + args = parser.parse_args([]) + assert args.kafka_bootstrap_servers == 'localhost:9092' diff --git a/tests/unit/test_pubsub/test_rabbitmq_backend.py b/tests/unit/test_pubsub/test_rabbitmq_backend.py index 54599723..5bafb78f 100644 --- a/tests/unit/test_pubsub/test_rabbitmq_backend.py +++ b/tests/unit/test_pubsub/test_rabbitmq_backend.py @@ -6,8 +6,6 @@ Does not require a running RabbitMQ instance. import pytest import argparse -pika = pytest.importorskip("pika", reason="pika not installed") - from trustgraph.base.rabbitmq_backend import RabbitMQBackend from trustgraph.base.pubsub import get_pubsub, add_pubsub_args diff --git a/trustgraph-base/pyproject.toml b/trustgraph-base/pyproject.toml index 4f1bce76..c4ef3c27 100644 --- a/trustgraph-base/pyproject.toml +++ b/trustgraph-base/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "requests", "python-logging-loki", "pika", + "confluent-kafka", "pyyaml", ] classifiers = [ diff --git a/trustgraph-base/trustgraph/base/kafka_backend.py b/trustgraph-base/trustgraph/base/kafka_backend.py new file mode 100644 index 00000000..8dfe8bfa --- /dev/null +++ b/trustgraph-base/trustgraph/base/kafka_backend.py @@ -0,0 +1,400 @@ +""" +Kafka backend implementation for pub/sub abstraction. + +Each logical topic maps to a Kafka topic. The topic name encodes +the full identity: + + class:topicspace:topic -> topicspace.class.topic + +Producers publish to the topic directly. +Consumers use consumer groups for competing-consumer semantics: + + - flow / request: named consumer group (competing consumers) + - response / notify: unique consumer group per instance, filtering + messages by correlation ID (all subscribers see all messages) + +The flow service manages topic lifecycle via AdminClient. + +Architecture: + Producer --> [Kafka topic] --> Consumer Group A --> Consumer + --> Consumer Group A --> Consumer + --> Consumer Group B --> Consumer (response) +""" + +import asyncio +import json +import logging +import uuid +from typing import Any + +from confluent_kafka import ( + Producer as KafkaProducer, + Consumer as KafkaConsumer, + TopicPartition, + KafkaError, + KafkaException, +) +from confluent_kafka.admin import AdminClient, NewTopic + +from .backend import PubSubBackend, BackendProducer, BackendConsumer, Message +from .serialization import dataclass_to_dict, dict_to_dataclass + +logger = logging.getLogger(__name__) + +# Retention defaults (milliseconds) +LONG_RETENTION_MS = 7 * 24 * 60 * 60 * 1000 # 7 days +SHORT_RETENTION_MS = 300 * 1000 # 5 minutes + + +class KafkaMessage: + """Wrapper for Kafka messages to match Message protocol.""" + + def __init__(self, msg, schema_cls): + self._msg = msg + self._schema_cls = schema_cls + self._value = None + + def value(self) -> Any: + """Deserialize and return the message value as a dataclass.""" + if self._value is None: + data_dict = json.loads(self._msg.value().decode('utf-8')) + self._value = dict_to_dataclass(data_dict, self._schema_cls) + return self._value + + def properties(self) -> dict: + """Return message properties from Kafka headers.""" + headers = self._msg.headers() or [] + return { + k: v.decode('utf-8') if isinstance(v, bytes) else v + for k, v in headers + } + + +class KafkaBackendProducer: + """Publishes messages to a Kafka topic. + + confluent-kafka Producer is thread-safe, so a single instance + can be shared across threads. + """ + + def __init__(self, bootstrap_servers, topic_name, durable): + self._topic_name = topic_name + self._durable = durable + self._producer = KafkaProducer({ + 'bootstrap.servers': bootstrap_servers, + 'acks': 'all' if durable else '1', + }) + + def send(self, message: Any, properties: dict = {}) -> None: + data_dict = dataclass_to_dict(message) + json_data = json.dumps(data_dict).encode('utf-8') + + headers = [ + (k, str(v).encode('utf-8')) + for k, v in properties.items() + ] if properties else None + + self._producer.produce( + topic=self._topic_name, + value=json_data, + headers=headers, + ) + self._producer.flush() + + def flush(self) -> None: + self._producer.flush() + + def close(self) -> None: + self._producer.flush() + + +class KafkaBackendConsumer: + """Consumes from a Kafka topic using a consumer group. + + Uses confluent-kafka Consumer.poll() for message delivery. + Not thread-safe — each instance must be used from a single thread, + which matches the ThreadPoolExecutor pattern in consumer.py. + """ + + def __init__(self, bootstrap_servers, topic_name, group_id, + schema_cls, auto_offset_reset='latest'): + self._bootstrap_servers = bootstrap_servers + self._topic_name = topic_name + self._group_id = group_id + self._schema_cls = schema_cls + self._auto_offset_reset = auto_offset_reset + self._consumer = None + + def _connect(self): + self._consumer = KafkaConsumer({ + 'bootstrap.servers': self._bootstrap_servers, + 'group.id': self._group_id, + 'auto.offset.reset': self._auto_offset_reset, + 'enable.auto.commit': False, + }) + self._consumer.subscribe([self._topic_name]) + logger.info( + f"Kafka consumer connected: topic={self._topic_name}, " + f"group={self._group_id}" + ) + + def _is_alive(self): + return self._consumer is not None + + def ensure_connected(self) -> None: + """Eagerly connect and subscribe. + + For response/notify consumers this must be called before the + corresponding request is published, so that the consumer is + assigned a partition and will see the response message. + """ + if not self._is_alive(): + self._connect() + + # Force a partition assignment by polling briefly. + # Without this, the consumer may not be assigned partitions + # until the first real poll(), creating a race where the + # request is sent before assignment completes. + self._consumer.poll(timeout=1.0) + + def receive(self, timeout_millis: int = 2000) -> Message: + """Receive a message. Raises TimeoutError if none available.""" + if not self._is_alive(): + self._connect() + + timeout_seconds = timeout_millis / 1000.0 + msg = self._consumer.poll(timeout=timeout_seconds) + + if msg is None: + raise TimeoutError("No message received within timeout") + + if msg.error(): + error = msg.error() + if error.code() == KafkaError._PARTITION_EOF: + raise TimeoutError("End of partition reached") + raise KafkaException(error) + + return KafkaMessage(msg, self._schema_cls) + + def acknowledge(self, message: Message) -> None: + """Commit the message's offset (next offset to read).""" + if isinstance(message, KafkaMessage) and message._msg: + tp = TopicPartition( + message._msg.topic(), + message._msg.partition(), + message._msg.offset() + 1, + ) + self._consumer.commit(offsets=[tp], asynchronous=False) + + def negative_acknowledge(self, message: Message) -> None: + """Seek back to the message's offset for redelivery.""" + if isinstance(message, KafkaMessage) and message._msg: + tp = TopicPartition( + message._msg.topic(), + message._msg.partition(), + message._msg.offset(), + ) + self._consumer.seek(tp) + + def unsubscribe(self) -> None: + if self._consumer: + try: + self._consumer.unsubscribe() + except Exception: + pass + + def close(self) -> None: + if self._consumer: + try: + self._consumer.close() + except Exception: + pass + self._consumer = None + + +class KafkaBackend: + """Kafka pub/sub backend using one topic per logical topic.""" + + def __init__(self, bootstrap_servers='localhost:9092', + security_protocol='PLAINTEXT', + sasl_mechanism=None, + sasl_username=None, + sasl_password=None): + self._bootstrap_servers = bootstrap_servers + + # AdminClient config + self._admin_config = { + 'bootstrap.servers': bootstrap_servers, + } + + if security_protocol != 'PLAINTEXT': + self._admin_config['security.protocol'] = security_protocol + if sasl_mechanism: + self._admin_config['sasl.mechanism'] = sasl_mechanism + if sasl_username: + self._admin_config['sasl.username'] = sasl_username + if sasl_password: + self._admin_config['sasl.password'] = sasl_password + + logger.info( + f"Kafka backend: {bootstrap_servers} " + f"protocol={security_protocol}" + ) + + def _parse_topic(self, topic_id: str) -> tuple[str, str, bool]: + """ + Parse topic identifier into Kafka topic name, class, and durability. + + Format: class:topicspace:topic + Returns: (topic_name, class, durable) + """ + if ':' not in topic_id: + return f'tg.flow.{topic_id}', 'flow', True + + parts = topic_id.split(':', 2) + if len(parts) != 3: + raise ValueError( + f"Invalid topic format: {topic_id}, " + f"expected class:topicspace:topic" + ) + + cls, topicspace, topic = parts + + if cls == 'flow': + durable = True + elif cls in ('request', 'response', 'notify'): + durable = False + else: + raise ValueError( + f"Invalid topic class: {cls}, " + f"expected flow, request, response, or notify" + ) + + topic_name = f"{topicspace}.{cls}.{topic}" + + return topic_name, cls, durable + + def _retention_ms(self, cls): + """Return retention.ms for a topic class.""" + if cls == 'flow': + return LONG_RETENTION_MS + return SHORT_RETENTION_MS + + def create_producer(self, topic: str, schema: type, + **options) -> BackendProducer: + topic_name, cls, durable = self._parse_topic(topic) + logger.debug(f"Creating producer: topic={topic_name}") + return KafkaBackendProducer( + self._bootstrap_servers, topic_name, durable, + ) + + def create_consumer(self, topic: str, subscription: str, schema: type, + initial_position: str = 'latest', + **options) -> BackendConsumer: + """Create a consumer subscribed to a Kafka topic. + + Behaviour is determined by the topic's class prefix: + - flow: named consumer group, competing consumers + - request: named consumer group, competing consumers + - response: unique consumer group per instance + - notify: unique consumer group per instance + """ + topic_name, cls, durable = self._parse_topic(topic) + + if cls in ('response', 'notify'): + # Per-subscriber: unique group so every instance sees + # every message. Filter by correlation ID happens at + # the Subscriber layer above. + group_id = f"{subscription}-{uuid.uuid4()}" + auto_offset_reset = 'latest' + else: + # Shared: named group, competing consumers + group_id = subscription + auto_offset_reset = ( + 'earliest' if initial_position == 'earliest' + else 'latest' + ) + + logger.debug( + f"Creating consumer: topic={topic_name}, " + f"group={group_id}, cls={cls}" + ) + + return KafkaBackendConsumer( + self._bootstrap_servers, topic_name, group_id, + schema, auto_offset_reset, + ) + + def _create_topic_sync(self, topic_name, retention_ms): + """Blocking topic creation via AdminClient.""" + admin = AdminClient(self._admin_config) + new_topic = NewTopic( + topic_name, + num_partitions=1, + replication_factor=1, + config={ + 'retention.ms': str(retention_ms), + }, + ) + fs = admin.create_topics([new_topic]) + for name, f in fs.items(): + try: + f.result() + logger.info(f"Created topic: {name}") + except KafkaException as e: + # Topic already exists — idempotent + if e.args[0].code() == KafkaError.TOPIC_ALREADY_EXISTS: + logger.debug(f"Topic already exists: {name}") + else: + raise + + async def create_topic(self, topic: str) -> None: + """Create a Kafka topic with appropriate retention.""" + topic_name, cls, durable = self._parse_topic(topic) + retention_ms = self._retention_ms(cls) + await asyncio.to_thread( + self._create_topic_sync, topic_name, retention_ms, + ) + + def _delete_topic_sync(self, topic_name): + """Blocking topic deletion via AdminClient.""" + admin = AdminClient(self._admin_config) + fs = admin.delete_topics([topic_name]) + for name, f in fs.items(): + try: + f.result() + logger.info(f"Deleted topic: {name}") + except KafkaException as e: + # Topic doesn't exist — idempotent + if e.args[0].code() == KafkaError.UNKNOWN_TOPIC_OR_PART: + logger.debug(f"Topic not found: {name}") + else: + raise + except Exception as e: + logger.debug(f"Topic delete for {name}: {e}") + + async def delete_topic(self, topic: str) -> None: + """Delete a Kafka topic.""" + topic_name, cls, durable = self._parse_topic(topic) + await asyncio.to_thread(self._delete_topic_sync, topic_name) + + def _topic_exists_sync(self, topic_name): + """Blocking topic existence check via AdminClient.""" + admin = AdminClient(self._admin_config) + metadata = admin.list_topics(timeout=10) + return topic_name in metadata.topics + + async def topic_exists(self, topic: str) -> bool: + """Check whether a Kafka topic exists.""" + topic_name, cls, durable = self._parse_topic(topic) + return await asyncio.to_thread( + self._topic_exists_sync, topic_name, + ) + + async def ensure_topic(self, topic: str) -> None: + """Ensure a topic exists, creating it if necessary.""" + if not await self.topic_exists(topic): + await self.create_topic(topic) + + def close(self) -> None: + pass diff --git a/trustgraph-base/trustgraph/base/pubsub.py b/trustgraph-base/trustgraph/base/pubsub.py index a7ae3719..fb4765c1 100644 --- a/trustgraph-base/trustgraph/base/pubsub.py +++ b/trustgraph-base/trustgraph/base/pubsub.py @@ -17,6 +17,12 @@ DEFAULT_RABBITMQ_USERNAME = os.getenv("RABBITMQ_USERNAME", 'guest') DEFAULT_RABBITMQ_PASSWORD = os.getenv("RABBITMQ_PASSWORD", 'guest') DEFAULT_RABBITMQ_VHOST = os.getenv("RABBITMQ_VHOST", '/') +DEFAULT_KAFKA_BOOTSTRAP = os.getenv("KAFKA_BOOTSTRAP_SERVERS", 'kafka:9092') +DEFAULT_KAFKA_PROTOCOL = os.getenv("KAFKA_SECURITY_PROTOCOL", 'PLAINTEXT') +DEFAULT_KAFKA_SASL_MECHANISM = os.getenv("KAFKA_SASL_MECHANISM", None) +DEFAULT_KAFKA_SASL_USERNAME = os.getenv("KAFKA_SASL_USERNAME", None) +DEFAULT_KAFKA_SASL_PASSWORD = os.getenv("KAFKA_SASL_PASSWORD", None) + def get_pubsub(**config: Any) -> Any: """ @@ -47,6 +53,25 @@ def get_pubsub(**config: Any) -> Any: password=config.get('rabbitmq_password', DEFAULT_RABBITMQ_PASSWORD), vhost=config.get('rabbitmq_vhost', DEFAULT_RABBITMQ_VHOST), ) + elif backend_type == 'kafka': + from .kafka_backend import KafkaBackend + return KafkaBackend( + bootstrap_servers=config.get( + 'kafka_bootstrap_servers', DEFAULT_KAFKA_BOOTSTRAP, + ), + security_protocol=config.get( + 'kafka_security_protocol', DEFAULT_KAFKA_PROTOCOL, + ), + sasl_mechanism=config.get( + 'kafka_sasl_mechanism', DEFAULT_KAFKA_SASL_MECHANISM, + ), + sasl_username=config.get( + 'kafka_sasl_username', DEFAULT_KAFKA_SASL_USERNAME, + ), + sasl_password=config.get( + 'kafka_sasl_password', DEFAULT_KAFKA_SASL_PASSWORD, + ), + ) else: raise ValueError(f"Unknown pub/sub backend: {backend_type}") @@ -65,6 +90,7 @@ def add_pubsub_args(parser: ArgumentParser, standalone: bool = False) -> None: pulsar_host = STANDALONE_PULSAR_HOST if standalone else DEFAULT_PULSAR_HOST pulsar_listener = 'localhost' if standalone else None rabbitmq_host = 'localhost' if standalone else DEFAULT_RABBITMQ_HOST + kafka_bootstrap = 'localhost:9092' if standalone else DEFAULT_KAFKA_BOOTSTRAP parser.add_argument( '--pubsub-backend', @@ -122,3 +148,34 @@ def add_pubsub_args(parser: ArgumentParser, standalone: bool = False) -> None: default=DEFAULT_RABBITMQ_VHOST, help=f'RabbitMQ vhost (default: {DEFAULT_RABBITMQ_VHOST})', ) + + # Kafka options + parser.add_argument( + '--kafka-bootstrap-servers', + default=kafka_bootstrap, + help=f'Kafka bootstrap servers (default: {kafka_bootstrap})', + ) + + parser.add_argument( + '--kafka-security-protocol', + default=DEFAULT_KAFKA_PROTOCOL, + help=f'Kafka security protocol (default: {DEFAULT_KAFKA_PROTOCOL})', + ) + + parser.add_argument( + '--kafka-sasl-mechanism', + default=DEFAULT_KAFKA_SASL_MECHANISM, + help='Kafka SASL mechanism', + ) + + parser.add_argument( + '--kafka-sasl-username', + default=DEFAULT_KAFKA_SASL_USERNAME, + help='Kafka SASL username', + ) + + parser.add_argument( + '--kafka-sasl-password', + default=DEFAULT_KAFKA_SASL_PASSWORD, + help='Kafka SASL password', + ) From cce3acd84f4e89d755d2e032e2a5bae210343c18 Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Sat, 18 Apr 2026 11:43:21 +0100 Subject: [PATCH 6/7] fix: repair deferred imports to preserve module-level names for test patching (#831) A previous commit moved SDK imports into __init__/methods and stashed them on self, which broke @patch targets in 24 unit tests. This fixes the approach: chunker and pdf_decoder use module-level sentinels with global/if-None guards so imports are still deferred but patchable. Google AI Studio reverts to standard module-level imports since the module is only loaded when communicating with Gemini. Keeps lazy loading on other imports. --- .../trustgraph/chunking/recursive/chunker.py | 14 ++++-- .../trustgraph/chunking/token/chunker.py | 12 +++-- .../trustgraph/decoding/pdf/pdf_decoder.py | 10 +++- .../text_completion/googleaistudio/llm.py | 48 ++++++++----------- 4 files changed, 48 insertions(+), 36 deletions(-) diff --git a/trustgraph-flow/trustgraph/chunking/recursive/chunker.py b/trustgraph-flow/trustgraph/chunking/recursive/chunker.py index 257edc8c..dc7b357c 100755 --- a/trustgraph-flow/trustgraph/chunking/recursive/chunker.py +++ b/trustgraph-flow/trustgraph/chunking/recursive/chunker.py @@ -10,6 +10,8 @@ from prometheus_client import Histogram from ... schema import TextDocument, Chunk, Metadata, Triples from ... base import ChunkingService, ConsumerSpec, ProducerSpec +RecursiveCharacterTextSplitter = None + from ... provenance import ( chunk_uri as make_chunk_uri, derived_entity_triples, set_graph, GRAPH_SOURCE, @@ -41,8 +43,12 @@ class Processor(ChunkingService): self.default_chunk_size = chunk_size self.default_chunk_overlap = chunk_overlap - from langchain_text_splitters import RecursiveCharacterTextSplitter - self.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter + global RecursiveCharacterTextSplitter + if RecursiveCharacterTextSplitter is None: + from langchain_text_splitters import ( + RecursiveCharacterTextSplitter as _cls, + ) + RecursiveCharacterTextSplitter = _cls if not hasattr(__class__, "chunk_metric"): __class__.chunk_metric = Histogram( @@ -52,7 +58,7 @@ class Processor(ChunkingService): 2500, 4000, 6400, 10000, 16000] ) - self.text_splitter = self.RecursiveCharacterTextSplitter( + self.text_splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, @@ -105,7 +111,7 @@ class Processor(ChunkingService): chunk_overlap = int(chunk_overlap) # Create text splitter with effective parameters - text_splitter = self.RecursiveCharacterTextSplitter( + text_splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, diff --git a/trustgraph-flow/trustgraph/chunking/token/chunker.py b/trustgraph-flow/trustgraph/chunking/token/chunker.py index d315c428..3f31beb9 100755 --- a/trustgraph-flow/trustgraph/chunking/token/chunker.py +++ b/trustgraph-flow/trustgraph/chunking/token/chunker.py @@ -10,6 +10,8 @@ from prometheus_client import Histogram from ... schema import TextDocument, Chunk, Metadata, Triples from ... base import ChunkingService, ConsumerSpec, ProducerSpec +TokenTextSplitter = None + from ... provenance import ( chunk_uri as make_chunk_uri, derived_entity_triples, set_graph, GRAPH_SOURCE, @@ -41,8 +43,10 @@ class Processor(ChunkingService): self.default_chunk_size = chunk_size self.default_chunk_overlap = chunk_overlap - from langchain_text_splitters import TokenTextSplitter - self.TokenTextSplitter = TokenTextSplitter + global TokenTextSplitter + if TokenTextSplitter is None: + from langchain_text_splitters import TokenTextSplitter as _cls + TokenTextSplitter = _cls if not hasattr(__class__, "chunk_metric"): __class__.chunk_metric = Histogram( @@ -52,7 +56,7 @@ class Processor(ChunkingService): 2500, 4000, 6400, 10000, 16000] ) - self.text_splitter = self.TokenTextSplitter( + self.text_splitter = TokenTextSplitter( encoding_name="cl100k_base", chunk_size=chunk_size, chunk_overlap=chunk_overlap, @@ -104,7 +108,7 @@ class Processor(ChunkingService): chunk_overlap = int(chunk_overlap) # Create text splitter with effective parameters - text_splitter = self.TokenTextSplitter( + text_splitter = TokenTextSplitter( encoding_name="cl100k_base", chunk_size=chunk_size, chunk_overlap=chunk_overlap, diff --git a/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py b/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py index ab31b717..7f9ca71d 100755 --- a/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py +++ b/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py @@ -15,6 +15,9 @@ from ... schema import Document, TextDocument, Metadata from ... schema import librarian_request_queue, librarian_response_queue from ... schema import Triples from ... base import FlowProcessor, ConsumerSpec, ProducerSpec, LibrarianClient + +PyPDFLoader = None + from ... provenance import ( document_uri, page_uri as make_page_uri, derived_entity_triples, set_graph, GRAPH_SOURCE, @@ -128,7 +131,12 @@ class Processor(FlowProcessor): fp.write(base64.b64decode(v.data)) fp.close() - from langchain_community.document_loaders import PyPDFLoader + global PyPDFLoader + if PyPDFLoader is None: + from langchain_community.document_loaders import ( + PyPDFLoader as _cls, + ) + PyPDFLoader = _cls loader = PyPDFLoader(temp_path) pages = loader.load() diff --git a/trustgraph-vertexai/trustgraph/model/text_completion/googleaistudio/llm.py b/trustgraph-vertexai/trustgraph/model/text_completion/googleaistudio/llm.py index 9f431d20..b01ff410 100644 --- a/trustgraph-vertexai/trustgraph/model/text_completion/googleaistudio/llm.py +++ b/trustgraph-vertexai/trustgraph/model/text_completion/googleaistudio/llm.py @@ -18,6 +18,12 @@ import logging # Module logger logger = logging.getLogger(__name__) +from google import genai +from google.genai import types +from google.genai.types import HarmCategory, HarmBlockThreshold +from google.genai.errors import ClientError +from google.api_core.exceptions import ResourceExhausted + from .... exceptions import TooManyRequests from .... base import LlmService, LlmResult, LlmChunk @@ -37,18 +43,6 @@ class Processor(LlmService): temperature = params.get("temperature", default_temperature) max_output = params.get("max_output", default_max_output) - from google import genai - from google.genai import types - from google.genai.types import HarmCategory, HarmBlockThreshold - from google.genai.errors import ClientError - from google.api_core.exceptions import ResourceExhausted - self.genai = genai - self.types = types - self.HarmCategory = HarmCategory - self.HarmBlockThreshold = HarmBlockThreshold - self.ClientError = ClientError - self.ResourceExhausted = ResourceExhausted - if api_key is None: raise RuntimeError("Google AI Studio API key not specified") @@ -60,7 +54,7 @@ class Processor(LlmService): } ) - self.client = self.genai.Client(api_key=api_key, vertexai=False) + self.client = genai.Client(api_key=api_key, vertexai=False) self.default_model = model self.temperature = temperature self.max_output = max_output @@ -68,23 +62,23 @@ class Processor(LlmService): # Cache for generation configs per model self.generation_configs = {} - block_level = self.HarmBlockThreshold.BLOCK_ONLY_HIGH + block_level = HarmBlockThreshold.BLOCK_ONLY_HIGH self.safety_settings = [ - self.types.SafetySetting( - category = self.HarmCategory.HARM_CATEGORY_HATE_SPEECH, + types.SafetySetting( + category = HarmCategory.HARM_CATEGORY_HATE_SPEECH, threshold = block_level, ), - self.types.SafetySetting( - category = self.HarmCategory.HARM_CATEGORY_HARASSMENT, + types.SafetySetting( + category = HarmCategory.HARM_CATEGORY_HARASSMENT, threshold = block_level, ), - self.types.SafetySetting( - category = self.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, + types.SafetySetting( + category = HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, threshold = block_level, ), - self.types.SafetySetting( - category = self.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, + types.SafetySetting( + category = HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, threshold = block_level, ), # There is a documentation conflict on whether or not @@ -104,7 +98,7 @@ class Processor(LlmService): if cache_key not in self.generation_configs: logger.info(f"Creating generation config for '{model_name}' with temperature {effective_temperature}") - self.generation_configs[cache_key] = self.types.GenerateContentConfig( + self.generation_configs[cache_key] = types.GenerateContentConfig( temperature = effective_temperature, top_p = 1, top_k = 40, @@ -153,14 +147,14 @@ class Processor(LlmService): return resp - except self.ResourceExhausted as e: + except ResourceExhausted as e: logger.warning("Rate limit exceeded") # Leave rate limit retries to the default handler raise TooManyRequests() - except self.ClientError as e: + except ClientError as e: # google-genai SDK throws ClientError for 4xx errors if e.code == 429: logger.warning(f"Rate limit exceeded (ClientError 429): {e}") @@ -229,11 +223,11 @@ class Processor(LlmService): logger.debug("Streaming complete") - except self.ResourceExhausted: + except ResourceExhausted: logger.warning("Rate limit exceeded during streaming") raise TooManyRequests() - except self.ClientError as e: + except ClientError as e: # google-genai SDK throws ClientError for 4xx errors if e.code == 429: logger.warning(f"Rate limit exceeded during streaming (ClientError 429): {e}") From adea97620307b5e249dd9c16f1fbd33000c333eb Mon Sep 17 00:00:00 2001 From: Het Patel <102606191+CuriousHet@users.noreply.github.com> Date: Sat, 18 Apr 2026 16:35:19 +0530 Subject: [PATCH 7/7] feat: implement retry logic and exponential backoff for S3 operations (#829) * feat: implement retry logic and exponential backoff for S3 operations * test: fix librarian mocks after BlobStore async conversion --- tests/unit/test_librarian/test_blob_store.py | 119 ++++++++++++++++++ .../test_librarian/test_chunked_upload.py | 4 + .../trustgraph/librarian/blob_store.py | 69 +++++++--- .../trustgraph/librarian/librarian.py | 8 +- 4 files changed, 179 insertions(+), 21 deletions(-) create mode 100644 tests/unit/test_librarian/test_blob_store.py diff --git a/tests/unit/test_librarian/test_blob_store.py b/tests/unit/test_librarian/test_blob_store.py new file mode 100644 index 00000000..b2b1541b --- /dev/null +++ b/tests/unit/test_librarian/test_blob_store.py @@ -0,0 +1,119 @@ +import asyncio +import io +import pytest +from unittest.mock import MagicMock, patch, AsyncMock +from uuid import uuid4 +from minio.error import S3Error +from trustgraph.librarian.blob_store import BlobStore + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_blob_store(): + """Create a BlobStore with mocked Minio client.""" + mock_minio = MagicMock() + with patch('trustgraph.librarian.blob_store.Minio', return_value=mock_minio): + # Prevent ensure_bucket from making network calls during init + with patch('trustgraph.librarian.blob_store.BlobStore.ensure_bucket'): + store = BlobStore( + endpoint="localhost:9000", + access_key="access", + secret_key="secret", + bucket_name="test-bucket" + ) + return store, mock_minio + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_add_success_no_retry(): + store, mock_minio = _make_blob_store() + object_id = uuid4() + + await store.add(object_id, b"data", "text/plain") + + mock_minio.put_object.assert_called_once() + +@pytest.mark.asyncio +async def test_retry_recovery_on_transient_failure(): + store, mock_minio = _make_blob_store() + store.base_delay = 0 # Disable delay for fast tests + + # Fail twice, succeed third time + mock_minio.put_object.side_effect = [ + Exception("Error 1"), + Exception("Error 2"), + MagicMock() + ] + + await store.add(uuid4(), b"data", "text/plain") + + assert mock_minio.put_object.call_count == 3 + +@pytest.mark.asyncio +async def test_retry_exhaustion_after_8_attempts(): + store, mock_minio = _make_blob_store() + store.base_delay = 0 + + # Permanent failure + mock_minio.put_object.side_effect = Exception("Permanent failure") + + with pytest.raises(Exception, match="Permanent failure"): + await store.add(uuid4(), b"data", "text/plain") + + # Author requirement: exactly 8 attempts + assert mock_minio.put_object.call_count == 8 + +@pytest.mark.asyncio +async def test_s3_error_triggers_retry(): + store, mock_minio = _make_blob_store() + store.base_delay = 0 + + # Mock S3Error + s3_err = S3Error("code", "msg", "res", "req", "host", None) + mock_minio.get_object.side_effect = [s3_err, MagicMock()] + + await store.get(uuid4()) + + assert mock_minio.get_object.call_count == 2 + +@pytest.mark.asyncio +async def test_exponential_backoff_delays(): + store, mock_minio = _make_blob_store() + # Use real base_delay to check math + store.base_delay = 0.25 + + # Correct method name is stat_object, not get_size + mock_minio.stat_object = MagicMock(side_effect=Exception("Wait")) + + with patch('asyncio.sleep', new_callable=AsyncMock) as mock_sleep: + with pytest.raises(Exception): + await store.get_size(uuid4()) + + # Should have 7 sleep calls for 8 attempts + assert mock_sleep.call_count == 7 + + # Check actual sleep durations: 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 16.0 + sleep_args = [call[0][0] for call in mock_sleep.call_args_list] + assert sleep_args == [0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 16.0] + +@pytest.mark.asyncio +async def test_runs_in_executor(): + """Verify that synchronous Minio calls are offloaded to an executor.""" + store, mock_minio = _make_blob_store() + + # Mock response object with .read() method + mock_response = MagicMock() + mock_response.read.return_value = b"result" + + with patch('asyncio.get_event_loop') as mock_loop: + mock_loop_instance = MagicMock() + mock_loop.return_value = mock_loop_instance + mock_loop_instance.run_in_executor = AsyncMock(return_value=mock_response) + + await store.get(uuid4()) + + mock_loop_instance.run_in_executor.assert_called_once() diff --git a/tests/unit/test_librarian/test_chunked_upload.py b/tests/unit/test_librarian/test_chunked_upload.py index d0b73d48..eef83e1e 100644 --- a/tests/unit/test_librarian/test_chunked_upload.py +++ b/tests/unit/test_librarian/test_chunked_upload.py @@ -22,6 +22,10 @@ def _make_librarian(min_chunk_size=1): """Create a Librarian with mocked blob_store and table_store.""" lib = Librarian.__new__(Librarian) lib.blob_store = MagicMock() + lib.blob_store.create_multipart_upload = AsyncMock() + lib.blob_store.upload_part = AsyncMock() + lib.blob_store.complete_multipart_upload = AsyncMock() + lib.blob_store.abort_multipart_upload = AsyncMock() lib.table_store = AsyncMock() lib.load_document = AsyncMock() lib.min_chunk_size = min_chunk_size diff --git a/trustgraph-flow/trustgraph/librarian/blob_store.py b/trustgraph-flow/trustgraph/librarian/blob_store.py index d75a7af9..55ed6c61 100644 --- a/trustgraph-flow/trustgraph/librarian/blob_store.py +++ b/trustgraph-flow/trustgraph/librarian/blob_store.py @@ -4,11 +4,12 @@ from .. exceptions import RequestError from minio import Minio from minio.datatypes import Part -import time +from minio.error import S3Error import io import logging from typing import Iterator, List, Tuple from uuid import UUID +import asyncio # Module logger logger = logging.getLogger(__name__) @@ -35,8 +36,36 @@ class BlobStore: protocol = "https" if use_ssl else "http" logger.info(f"Connected to S3-compatible storage at {protocol}://{endpoint}") + # Retry and Exponential delay configuration + self.max_retries = 8 + self.base_delay = 0.25 + self.ensure_bucket() + async def _with_retry(self, operation, *args, **kwargs): + """Execute a minio operation with exponential backoff retry.""" + last_exception = None + for attempt in range(self.max_retries): + try: + # Run the synchronous minio call in the default executor to avoid blocking + return await asyncio.get_event_loop().run_in_executor( + None, lambda: operation(*args, **kwargs) + ) + except (S3Error, Exception) as e: + last_exception = e + if attempt < self.max_retries - 1: + delay = self.base_delay * (2 ** attempt) + logger.warning( + f"S3 operation failed: {e}. " + f"Retrying in {delay}s... (Attempt {attempt + 1}/{self.max_retries})" + ) + await asyncio.sleep(delay) + else: + logger.error(f"S3 operation failed after {self.max_retries} attempts: {e}") + + if last_exception: + raise last_exception + def ensure_bucket(self): # Make the bucket if it doesn't exist. @@ -49,8 +78,8 @@ class BlobStore: async def add(self, object_id, blob, kind): - # FIXME: Loop retry - self.client.put_object( + await self._with_retry( + self.client.put_object, bucket_name = self.bucket_name, object_name = "doc/" + str(object_id), length = len(blob), @@ -62,8 +91,8 @@ class BlobStore: async def remove(self, object_id): - # FIXME: Loop retry - self.client.remove_object( + await self._with_retry( + self.client.remove_object, bucket_name = self.bucket_name, object_name = "doc/" + str(object_id), ) @@ -73,8 +102,8 @@ class BlobStore: async def get(self, object_id): - # FIXME: Loop retry - resp = self.client.get_object( + resp = await self._with_retry( + self.client.get_object, bucket_name = self.bucket_name, object_name = "doc/" + str(object_id), ) @@ -83,7 +112,8 @@ class BlobStore: async def get_range(self, object_id, offset: int, length: int) -> bytes: """Fetch a specific byte range from an object.""" - resp = self.client.get_object( + resp = await self._with_retry( + self.client.get_object, bucket_name=self.bucket_name, object_name="doc/" + str(object_id), offset=offset, @@ -97,7 +127,8 @@ class BlobStore: async def get_size(self, object_id) -> int: """Get the size of an object without downloading it.""" - stat = self.client.stat_object( + stat = await self._with_retry( + self.client.stat_object, bucket_name=self.bucket_name, object_name="doc/" + str(object_id), ) @@ -134,7 +165,7 @@ class BlobStore: logger.debug("Stream complete") - def create_multipart_upload(self, object_id: UUID, kind: str) -> str: + async def create_multipart_upload(self, object_id: UUID, kind: str) -> str: """ Initialize a multipart upload. @@ -148,7 +179,8 @@ class BlobStore: object_name = "doc/" + str(object_id) # Use minio's internal method to create multipart upload - upload_id = self.client._create_multipart_upload( + upload_id = await self._with_retry( + self.client._create_multipart_upload, bucket_name=self.bucket_name, object_name=object_name, headers={"Content-Type": kind}, @@ -157,7 +189,7 @@ class BlobStore: logger.info(f"Created multipart upload {upload_id} for {object_id}") return upload_id - def upload_part( + async def upload_part( self, object_id: UUID, upload_id: str, @@ -178,7 +210,8 @@ class BlobStore: """ object_name = "doc/" + str(object_id) - etag = self.client._upload_part( + etag = await self._with_retry( + self.client._upload_part, bucket_name=self.bucket_name, object_name=object_name, data=data, @@ -190,7 +223,7 @@ class BlobStore: logger.debug(f"Uploaded part {part_number} for {object_id}, etag={etag}") return etag - def complete_multipart_upload( + async def complete_multipart_upload( self, object_id: UUID, upload_id: str, @@ -214,7 +247,8 @@ class BlobStore: for part_number, etag in parts ] - self.client._complete_multipart_upload( + await self._with_retry( + self.client._complete_multipart_upload, bucket_name=self.bucket_name, object_name=object_name, upload_id=upload_id, @@ -223,7 +257,7 @@ class BlobStore: logger.info(f"Completed multipart upload for {object_id}") - def abort_multipart_upload(self, object_id: UUID, upload_id: str) -> None: + async def abort_multipart_upload(self, object_id: UUID, upload_id: str) -> None: """ Abort a multipart upload, cleaning up any uploaded parts. @@ -233,7 +267,8 @@ class BlobStore: """ object_name = "doc/" + str(object_id) - self.client._abort_multipart_upload( + await self._with_retry( + self.client._abort_multipart_upload, bucket_name=self.bucket_name, object_name=object_name, upload_id=upload_id, diff --git a/trustgraph-flow/trustgraph/librarian/librarian.py b/trustgraph-flow/trustgraph/librarian/librarian.py index 2a8ad3a6..77232650 100644 --- a/trustgraph-flow/trustgraph/librarian/librarian.py +++ b/trustgraph-flow/trustgraph/librarian/librarian.py @@ -301,7 +301,7 @@ class Librarian: object_id = uuid.uuid4() # Create S3 multipart upload - s3_upload_id = self.blob_store.create_multipart_upload( + s3_upload_id = await self.blob_store.create_multipart_upload( object_id, request.document_metadata.kind ) @@ -367,7 +367,7 @@ class Librarian: # Upload to S3 (part numbers are 1-indexed in S3) part_number = request.chunk_index + 1 - etag = self.blob_store.upload_part( + etag = await self.blob_store.upload_part( object_id=session["object_id"], upload_id=session["s3_upload_id"], part_number=part_number, @@ -440,7 +440,7 @@ class Librarian: ] # Complete S3 multipart upload - self.blob_store.complete_multipart_upload( + await self.blob_store.complete_multipart_upload( object_id=session["object_id"], upload_id=session["s3_upload_id"], parts=parts, @@ -492,7 +492,7 @@ class Librarian: raise RequestError("Not authorized to abort this upload") # Abort S3 multipart upload - self.blob_store.abort_multipart_upload( + await self.blob_store.abort_multipart_upload( object_id=session["object_id"], upload_id=session["s3_upload_id"], )