Config push notify pattern: replace stateful pub/sub with signal+ fetch (#760)

Replace the config push mechanism that broadcast the full config
blob on a 'state' class pub/sub queue with a lightweight notify
signal containing only the version number and affected config
types. Processors fetch the full config via request/response from
the config service when notified.

This eliminates the need for the pub/sub 'state' queue class and
stateful pub/sub services entirely. The config push queue moves
from 'state' to 'flow' class — a simple transient signal rather
than a retained message.  This solves the RabbitMQ
late-subscriber problem where restarting processes never received
the current config because their fresh queue had no historical
messages.

Key changes:
- ConfigPush schema: config dict replaced with types list
- Subscribe-then-fetch startup with retry: processors subscribe
  to notify queue, fetch config via request/response, then
  process buffered notifies with version comparison to avoid race
  conditions
- register_config_handler() accepts optional types parameter so
  handlers only fire when their config types change
- Short-lived config request/response clients to avoid subscriber
  contention on non-persistent response topics
- Config service passes affected types through put/delete/flow
  operations
- Gateway ConfigReceiver rewritten with same notify pattern and
  retry loop

Tests updated

New tests:
- register_config_handler: without types, with types, multiple
  types, multiple handlers
- on_config_notify: old/same version skipped, irrelevant types
  skipped (version still updated), relevant type triggers fetch,
  handler without types always called, mixed handler filtering,
  empty types invokes all, fetch failure handled gracefully
- fetch_config: returns config+version, raises on error response,
  stops client even on exception
- fetch_and_apply_config: applies to all handlers on startup,
  retries on failure
This commit is contained in:
cybermaggedon 2026-04-06 16:57:27 +01:00 committed by GitHub
parent d4723566cb
commit 4acd853023
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
37 changed files with 1449 additions and 406 deletions

View file

@ -1,7 +1,8 @@
# Base class for processors. Implements:
# - Pulsar client, subscribe and consume basic
# - Pub/sub client, subscribe and consume basic
# - the async startup logic
# - Config notify handling with subscribe-then-fetch pattern
# - Initialising metrics
import asyncio
@ -12,12 +13,17 @@ import logging
import os
from prometheus_client import start_http_server, Info
from .. schema import ConfigPush, config_push_queue
from .. schema import ConfigPush, ConfigRequest, ConfigResponse
from .. schema import config_push_queue, config_request_queue
from .. schema import config_response_queue
from .. log_level import LogLevel
from . pubsub import get_pubsub, add_pubsub_args
from . producer import Producer
from . consumer import Consumer
from . metrics import ProcessorMetrics, ConsumerMetrics
from . subscriber import Subscriber
from . request_response_spec import RequestResponse
from . metrics import ProcessorMetrics, ConsumerMetrics, ProducerMetrics
from . metrics import SubscriberMetrics
from . logging import add_logging_args, setup_logging
default_config_queue = config_push_queue
@ -57,9 +63,13 @@ class AsyncProcessor:
"config_push_queue", default_config_queue
)
# This records registered configuration handlers
# This records registered configuration handlers, each entry is:
# { "handler": async_fn, "types": set_or_none }
self.config_handlers = []
# Track the current config version for dedup
self.config_version = 0
# Create a random ID for this subscription to the configuration
# service
config_subscriber_id = str(uuid.uuid4())
@ -68,8 +78,7 @@ class AsyncProcessor:
processor = self.id, flow = None, name = "config",
)
# Subscribe to config queue — exclusive so every processor
# gets its own copy of config pushes (broadcast pattern)
# Subscribe to config notify queue
self.config_sub_task = Consumer(
taskgroup = self.taskgroup,
@ -80,21 +89,93 @@ class AsyncProcessor:
topic = self.config_push_queue,
schema = ConfigPush,
handler = self.on_config_change,
handler = self.on_config_notify,
metrics = config_consumer_metrics,
start_of_messages = True,
start_of_messages = False,
consumer_type = 'exclusive',
)
self.running = True
# This is called to start dynamic behaviour. An over-ride point for
# extra functionality
def _create_config_client(self):
"""Create a short-lived config request/response client."""
config_rr_id = str(uuid.uuid4())
config_req_metrics = ProducerMetrics(
processor = self.id, flow = None, name = "config-request",
)
config_resp_metrics = SubscriberMetrics(
processor = self.id, flow = None, name = "config-response",
)
return RequestResponse(
backend = self.pubsub_backend,
subscription = f"{self.id}--config--{config_rr_id}",
consumer_name = self.id,
request_topic = config_request_queue,
request_schema = ConfigRequest,
request_metrics = config_req_metrics,
response_topic = config_response_queue,
response_schema = ConfigResponse,
response_metrics = config_resp_metrics,
)
async def fetch_config(self):
"""Fetch full config from config service using a short-lived
request/response client. Returns (config, version) or raises."""
client = self._create_config_client()
try:
await client.start()
resp = await client.request(
ConfigRequest(operation="config"),
timeout=10,
)
if resp.error:
raise RuntimeError(f"Config error: {resp.error.message}")
return resp.config, resp.version
finally:
await client.stop()
# This is called to start dynamic behaviour.
# Implements the subscribe-then-fetch pattern to avoid race conditions.
async def start(self):
# 1. Start the notify consumer (begins buffering incoming notifys)
await self.config_sub_task.start()
# 2. Fetch current config via request/response
await self.fetch_and_apply_config()
# 3. Any buffered notifys with version > fetched version will be
# processed by on_config_notify, which does the version check
async def fetch_and_apply_config(self):
"""Fetch full config from config service and apply to all handlers.
Retries until successful config service may not be ready yet."""
while self.running:
try:
config, version = await self.fetch_config()
logger.info(f"Fetched config version {version}")
self.config_version = version
# Apply to all handlers (startup = invoke all)
for entry in self.config_handlers:
await entry["handler"](config, version)
return
except Exception as e:
logger.warning(
f"Config fetch failed: {e}, retrying in 2s..."
)
await asyncio.sleep(2)
# This is called to stop all threads. An over-ride point for extra
# functionality
def stop(self):
@ -110,20 +191,66 @@ class AsyncProcessor:
def pulsar_host(self): return self._pulsar_host
# Register a new event handler for configuration change
def register_config_handler(self, handler):
self.config_handlers.append(handler)
def register_config_handler(self, handler, types=None):
self.config_handlers.append({
"handler": handler,
"types": set(types) if types else None,
})
# Called when a new configuration message push occurs
async def on_config_change(self, message, consumer, flow):
# Called when a config notify message arrives
async def on_config_notify(self, message, consumer, flow):
# Get configuration data and version number
config = message.value().config
version = message.value().version
notify_version = message.value().version
notify_types = set(message.value().types)
# Invoke message handlers
logger.info(f"Config change event: version={version}")
for ch in self.config_handlers:
await ch(config, version)
# Skip if we already have this version or newer
if notify_version <= self.config_version:
logger.debug(
f"Ignoring config notify v{notify_version}, "
f"already at v{self.config_version}"
)
return
# Check if any handler cares about the affected types
if notify_types:
any_interested = False
for entry in self.config_handlers:
handler_types = entry["types"]
if handler_types is None or notify_types & handler_types:
any_interested = True
break
if not any_interested:
logger.debug(
f"Ignoring config notify v{notify_version}, "
f"no handlers for types {notify_types}"
)
self.config_version = notify_version
return
logger.info(
f"Config notify v{notify_version} types={list(notify_types)}, "
f"fetching config..."
)
# Fetch full config using short-lived client
try:
config, version = await self.fetch_config()
self.config_version = version
# Invoke handlers that care about the affected types
for entry in self.config_handlers:
handler_types = entry["types"]
if handler_types is None:
await entry["handler"](config, version)
elif not notify_types or notify_types & handler_types:
await entry["handler"](config, version)
except Exception as e:
logger.error(
f"Failed to fetch config on notify: {e}", exc_info=True
)
# This is the 'main' body of the handler. It is a point to override
# if needed. By default does nothing. Processors are implemented
@ -181,7 +308,7 @@ class AsyncProcessor:
prog=ident,
description=doc
)
parser.add_argument(
'--id',
default=ident,
@ -271,4 +398,3 @@ class AsyncProcessor:
default=8000,
help=f'Pulsar host (default: 8000)',
)

View file

@ -26,7 +26,9 @@ class FlowProcessor(AsyncProcessor):
super(FlowProcessor, self).__init__(**params)
# Register configuration handler
self.register_config_handler(self.on_configure_flows)
self.register_config_handler(
self.on_configure_flows, types=["active-flow"]
)
# Initialise flow information state
self.flows = {}