refactor: use one fanout exchange per topic instead of shared topic exchange (#827)

The RabbitMQ backend used a single topic exchange per topicspace
with routing keys to differentiate logical topics. This meant the
flow service had to manually create named queues for every
processor-topic pair, including producer-side topics — creating
phantom queues that accumulated unread message copies indefinitely.

Replace with one fanout exchange per logical topic. Consumers now
declare and bind their own queues on connect. The flow service
manages topic lifecycle (create/delete exchanges) rather than queue
lifecycle, and only collects unique topic identifiers instead of
per-processor (topic, subscription) pairs.

Backend API: create_queue/delete_queue/ensure_queue replaced with
create_topic/delete_topic/ensure_topic (subscription parameter
removed).
This commit is contained in:
cybermaggedon 2026-04-17 18:01:35 +01:00 committed by GitHub
parent 391b9076f3
commit 3505bfdd25
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 190 additions and 228 deletions

View file

@@ -124,9 +124,7 @@ class Processor(AsyncProcessor):
async def start(self):
await self.pubsub.ensure_queue(
self.config_request_topic, self.config_request_subscriber
)
await self.pubsub.ensure_topic(self.config_request_topic)
await self.push() # Startup poke: empty types = everything
await self.config_request_consumer.start()

View file

@@ -119,9 +119,7 @@ class Processor(AsyncProcessor):
async def start(self):
await self.pubsub.ensure_queue(
self.knowledge_request_topic, self.knowledge_request_subscriber
)
await self.pubsub.ensure_topic(self.knowledge_request_topic)
await super(Processor, self).start()
await self.knowledge_request_consumer.start()
await self.knowledge_response_producer.start()

View file

@@ -7,7 +7,7 @@ import logging
# Module logger
logger = logging.getLogger(__name__)
# Queue deletion retry settings
# Topic deletion retry settings
DELETE_RETRIES = 5
DELETE_RETRY_DELAY = 2 # seconds
@@ -215,11 +215,11 @@ class FlowConfig:
return result
# Pre-create flow-level queues so the data path is wired
# Pre-create topic exchanges so the data path is wired
# before processors receive their config and start connecting.
queues = self._collect_flow_queues(cls, repl_template_with_params)
for topic, subscription in queues:
await self.pubsub.create_queue(topic, subscription)
topics = self._collect_flow_topics(cls, repl_template_with_params)
for topic in topics:
await self.pubsub.create_topic(topic)
# Build all processor config updates, then write in a single batch.
updates = []
@@ -283,8 +283,8 @@ class FlowConfig:
error = None,
)
async def ensure_existing_flow_queues(self):
"""Ensure queues exist for all already-running flows.
async def ensure_existing_flow_topics(self):
"""Ensure topics exist for all already-running flows.
Called on startup to handle flows that were started before this
version of the flow service was deployed, or before a restart.
@@ -315,7 +315,7 @@ class FlowConfig:
if blueprint_data is None:
logger.warning(
f"Blueprint '{blueprint_name}' not found for "
f"flow '{flow_id}', skipping queue creation"
f"flow '{flow_id}', skipping topic creation"
)
continue
@@ -333,65 +333,63 @@ class FlowConfig:
)
return result
queues = self._collect_flow_queues(cls, repl_template)
for topic, subscription in queues:
await self.pubsub.ensure_queue(topic, subscription)
topics = self._collect_flow_topics(cls, repl_template)
for topic in topics:
await self.pubsub.ensure_topic(topic)
logger.info(
f"Ensured queues for existing flow '{flow_id}'"
f"Ensured topics for existing flow '{flow_id}'"
)
except Exception as e:
logger.error(
f"Failed to ensure queues for flow '{flow_id}': {e}"
f"Failed to ensure topics for flow '{flow_id}': {e}"
)
def _collect_flow_queues(self, cls, repl_template):
"""Collect (topic, subscription) pairs for all flow-level queues.
def _collect_flow_topics(self, cls, repl_template):
"""Collect unique topic identifiers from the blueprint.
Iterates the blueprint's "flow" section and reads only the
"topics" dict from each processor entry.
Iterates the blueprint's "flow" section and returns a
deduplicated set of resolved topic strings. The flow service
manages topic lifecycle (create/delete exchanges), not
individual consumer queues.
"""
queues = []
topics = set()
for k, v in cls["flow"].items():
processor, variant = k.split(":", 1)
variant = repl_template(variant)
for spec_name, topic_template in v.get("topics", {}).items():
topic = repl_template(topic_template)
subscription = f"{processor}--{variant}--{spec_name}"
queues.append((topic, subscription))
topics.add(topic)
return queues
return topics
async def _delete_queues(self, queues):
"""Delete queues with retries. Best-effort — logs failures but
async def _delete_topics(self, topics):
"""Delete topics with retries. Best-effort — logs failures but
does not raise."""
for attempt in range(DELETE_RETRIES):
remaining = []
for topic, subscription in queues:
for topic in topics:
try:
await self.pubsub.delete_queue(topic, subscription)
await self.pubsub.delete_topic(topic)
except Exception as e:
logger.warning(
f"Queue delete failed (attempt {attempt + 1}/"
f"Topic delete failed (attempt {attempt + 1}/"
f"{DELETE_RETRIES}): {topic}: {e}"
)
remaining.append((topic, subscription))
remaining.append(topic)
if not remaining:
return
queues = remaining
topics = remaining
if attempt < DELETE_RETRIES - 1:
await asyncio.sleep(DELETE_RETRY_DELAY)
for topic, subscription in queues:
for topic in topics:
logger.error(
f"Failed to delete queue after {DELETE_RETRIES} "
f"Failed to delete topic after {DELETE_RETRIES} "
f"attempts: {topic}"
)
@@ -426,8 +424,8 @@ class FlowConfig:
result = result.replace(f"{{{param_name}}}", str(param_value))
return result
# Collect queue identifiers before removing config
queues = self._collect_flow_queues(cls, repl_template)
# Collect topic identifiers before removing config
topics = self._collect_flow_topics(cls, repl_template)
# Phase 1: Set status to "stopping" and remove processor config.
# The config push tells processors to shut down their consumers.
@@ -448,8 +446,8 @@ class FlowConfig:
await self.config.delete_many(deletes)
# Phase 2: Delete queues with retries, then remove the flow record.
await self._delete_queues(queues)
# Phase 2: Delete topics with retries, then remove the flow record.
await self._delete_topics(topics)
if msg.flow_id in await self.config.keys("flow"):
await self.config.delete("flow", msg.flow_id)

View file

@@ -101,11 +101,9 @@ class Processor(AsyncProcessor):
async def start(self):
await self.pubsub.ensure_queue(
self.flow_request_topic, self.flow_request_subscriber
)
await self.pubsub.ensure_topic(self.flow_request_topic)
await self.config_client.start()
await self.flow.ensure_existing_flow_queues()
await self.flow.ensure_existing_flow_topics()
await self.flow_request_consumer.start()
async def on_flow_request(self, msg, consumer, flow):

View file

@@ -263,12 +263,8 @@ class Processor(AsyncProcessor):
async def start(self):
await self.pubsub.ensure_queue(
self.librarian_request_topic, self.librarian_request_subscriber
)
await self.pubsub.ensure_queue(
self.collection_request_topic, self.collection_request_subscriber
)
await self.pubsub.ensure_topic(self.librarian_request_topic)
await self.pubsub.ensure_topic(self.collection_request_topic)
await super(Processor, self).start()
await self.librarian_request_consumer.start()
await self.librarian_response_producer.start()