Flow service lifecycle management (#822)

feat: separate flow service from config service with explicit queue
lifecycle management

The flow service is now an independent service that owns the lifecycle
of flow and blueprint queues. System services own their own queues.
Consumers never create queues.

Flow service separation:
- New service at trustgraph-flow/trustgraph/flow/service/
- Uses async ConfigClient (RequestResponse pattern) to talk to config
  service
- Config service stripped of all flow handling

Queue lifecycle management:
- PubSubBackend protocol gains create_queue, delete_queue,
  queue_exists, ensure_queue — all async
- RabbitMQ: implements via pika with asyncio.to_thread internally
- Pulsar: stubs for future admin REST API implementation
- Consumer _connect() no longer creates queues (passive=True for named
  queues)
- System services call ensure_queue on startup
- Flow service creates queues on flow start, deletes on flow stop
- Flow service ensures queues for pre-existing flows on startup

Two-phase flow stop:
- Phase 1: set flow status to "stopping", delete processor config
  entries
- Phase 2: retry queue deletion, then delete flow record

Config restructure:
- active-flow config replaced with processor:{name} types
- Each processor has its own config type, each flow variant is a key
- Flow start/stop use batch put/delete — single config push per
  operation
- FlowProcessor subscribes to its own type only

Blueprint format:
- Processor entries split into topics and parameters dicts
- Flow interfaces use {"flow": "topic"} instead of bare strings
- Specs (ConsumerSpec, ProducerSpec, etc.) read from
  definition["topics"]

Tests updated.
This commit is contained in:
cybermaggedon 2026-04-16 17:19:39 +01:00 committed by GitHub
parent 645b6a66fd
commit 9f84891fcc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
31 changed files with 1202 additions and 398 deletions

View file

@ -0,0 +1,2 @@
from . service import *

View file

@ -0,0 +1,2 @@
from . service import *

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
# Package entry point: delegates to the service module's run() function,
# so `python -m <package>` starts the flow service.
from . service import run
if __name__ == '__main__':
    run()

View file

@ -0,0 +1,490 @@
from trustgraph.schema import FlowResponse, Error
import asyncio
import json
import logging
# Module logger
logger = logging.getLogger(__name__)
# Queue deletion retry settings: a failed delete_queue call is retried up
# to DELETE_RETRIES times, sleeping DELETE_RETRY_DELAY seconds between
# rounds (see FlowConfig._delete_queues).
DELETE_RETRIES = 5
DELETE_RETRY_DELAY = 2 # seconds
class FlowConfig:
def __init__(self, config, pubsub):
self.config = config
self.pubsub = pubsub
# Cache for parameter type definitions to avoid repeated lookups
self.param_type_cache = {}
async def resolve_parameters(self, flow_blueprint, user_params):
"""
Resolve parameters by merging user-provided values with defaults.
Args:
flow_blueprint: The flow blueprint definition dict
user_params: User-provided parameters dict (may be None or empty)
Returns:
Complete parameter dict with user values and defaults merged
(all values as strings)
"""
# If the flow blueprint has no parameters section, return user params as-is (stringified)
if "parameters" not in flow_blueprint:
if not user_params:
return {}
# Ensure all values are strings
return {k: str(v) for k, v in user_params.items()}
resolved = {}
flow_params = flow_blueprint["parameters"]
user_params = user_params if user_params else {}
# First pass: resolve parameters with explicit values or defaults
for param_name, param_meta in flow_params.items():
# Check if user provided a value
if param_name in user_params:
# Store as string
resolved[param_name] = str(user_params[param_name])
else:
# Look up the parameter type definition
param_type = param_meta.get("type")
if param_type:
# Check cache first
if param_type not in self.param_type_cache:
try:
# Fetch parameter type definition from config store
type_def = await self.config.get(
"parameter-type", param_type
)
if type_def:
self.param_type_cache[param_type] = json.loads(type_def)
else:
logger.warning(f"Parameter type '{param_type}' not found in config")
self.param_type_cache[param_type] = {}
except Exception as e:
logger.error(f"Error fetching parameter type '{param_type}': {e}")
self.param_type_cache[param_type] = {}
# Apply default from type definition (as string)
type_def = self.param_type_cache[param_type]
if "default" in type_def:
default_value = type_def["default"]
# Convert to string based on type
if isinstance(default_value, bool):
resolved[param_name] = "true" if default_value else "false"
else:
resolved[param_name] = str(default_value)
elif type_def.get("required", False):
# Required parameter with no default and no user value
raise RuntimeError(f"Required parameter '{param_name}' not provided and has no default")
# Second pass: handle controlled-by relationships
for param_name, param_meta in flow_params.items():
if param_name not in resolved and "controlled-by" in param_meta:
controller = param_meta["controlled-by"]
if controller in resolved:
# Inherit value from controlling parameter (already a string)
resolved[param_name] = resolved[controller]
else:
# Controller has no value, try to get default from type definition
param_type = param_meta.get("type")
if param_type and param_type in self.param_type_cache:
type_def = self.param_type_cache[param_type]
if "default" in type_def:
default_value = type_def["default"]
# Convert to string based on type
if isinstance(default_value, bool):
resolved[param_name] = "true" if default_value else "false"
else:
resolved[param_name] = str(default_value)
# Include any extra parameters from user that weren't in flow blueprint definition
# This allows for forward compatibility (ensure they're strings)
for key, value in user_params.items():
if key not in resolved:
resolved[key] = str(value)
return resolved
async def handle_list_blueprints(self, msg):
names = list(await self.config.keys("flow-blueprint"))
return FlowResponse(
error = None,
blueprint_names = names,
)
async def handle_get_blueprint(self, msg):
return FlowResponse(
error = None,
blueprint_definition = await self.config.get(
"flow-blueprint", msg.blueprint_name
),
)
async def handle_put_blueprint(self, msg):
await self.config.put(
"flow-blueprint",
msg.blueprint_name, msg.blueprint_definition
)
return FlowResponse(
error = None,
)
async def handle_delete_blueprint(self, msg):
logger.debug(f"Flow config message: {msg}")
await self.config.delete("flow-blueprint", msg.blueprint_name)
return FlowResponse(
error = None,
)
async def handle_list_flows(self, msg):
names = list(await self.config.keys("flow"))
return FlowResponse(
error = None,
flow_ids = names,
)
async def handle_get_flow(self, msg):
flow_data = await self.config.get("flow", msg.flow_id)
flow = json.loads(flow_data)
return FlowResponse(
error = None,
flow = flow_data,
description = flow.get("description", ""),
parameters = flow.get("parameters", {}),
)
    async def handle_start_flow(self, msg):
        """Start a new flow from a blueprint.

        Steps, in order: validate the request, resolve parameters,
        pre-create the flow's queues, write every processor config entry
        in one batch, then record the flow itself.

        Raises:
            RuntimeError: on missing blueprint name / flow ID /
                description, an already-existing flow, or an unknown
                blueprint.
        """
        if msg.blueprint_name is None:
            raise RuntimeError("No blueprint name")
        if msg.flow_id is None:
            raise RuntimeError("No flow ID")
        if msg.flow_id in await self.config.keys("flow"):
            raise RuntimeError("Flow already exists")
        if msg.description is None:
            raise RuntimeError("No description")
        if msg.blueprint_name not in await self.config.keys("flow-blueprint"):
            raise RuntimeError("Blueprint does not exist")
        cls = json.loads(
            await self.config.get("flow-blueprint", msg.blueprint_name)
        )
        # Resolve parameters by merging user-provided values with defaults
        user_params = msg.parameters if msg.parameters else {}
        parameters = await self.resolve_parameters(cls, user_params)
        # Log the resolved parameters for debugging
        logger.debug(f"User provided parameters: {user_params}")
        logger.debug(f"Resolved parameters (with defaults): {parameters}")
        # Apply parameter substitution to template replacement function
        def repl_template_with_params(tmp):
            # Substitutes {blueprint} and {id}, then each resolved
            # parameter as {name} -> value.
            result = tmp.replace(
                "{blueprint}", msg.blueprint_name
            ).replace(
                "{id}", msg.flow_id
            )
            # Apply parameter substitutions
            for param_name, param_value in parameters.items():
                result = result.replace(f"{{{param_name}}}", str(param_value))
            return result
        # Pre-create flow-level queues so the data path is wired
        # before processors receive their config and start connecting.
        queues = self._collect_flow_queues(cls, repl_template_with_params)
        for topic, subscription in queues:
            await self.pubsub.create_queue(topic, subscription)
        # Build all processor config updates, then write in a single batch.
        updates = []
        for kind in ("blueprint", "flow"):
            for k, v in cls[kind].items():
                # Entry keys look like "processor:variant-template".
                processor, variant = k.split(":", 1)
                variant = repl_template_with_params(variant)
                # Templates may appear in both keys and values of the
                # topics/parameters dicts, so substitute both.
                topics = {
                    repl_template_with_params(k2): repl_template_with_params(v2)
                    for k2, v2 in v.get("topics", {}).items()
                }
                params = {
                    repl_template_with_params(k2): repl_template_with_params(v2)
                    for k2, v2 in v.get("parameters", {}).items()
                }
                entry = {
                    "topics": topics,
                    "parameters": params,
                }
                updates.append((
                    f"processor:{processor}",
                    variant,
                    json.dumps(entry),
                ))
        # Single batch write: one config push notifies processors of the
        # complete flow at once.
        await self.config.put_many(updates)
        def repl_interface(i):
            # Interface entries are dicts mapping a role name to a topic
            # template; only the values are substituted.
            return {
                k: repl_template_with_params(v)
                for k, v in i.items()
            }
        if "interfaces" in cls:
            interfaces = {
                k: repl_interface(v)
                for k, v in cls["interfaces"].items()
            }
        else:
            interfaces = {}
        await self.config.put(
            "flow", msg.flow_id,
            json.dumps({
                "description": msg.description,
                "blueprint-name": msg.blueprint_name,
                "interfaces": interfaces,
                "parameters": parameters,
            })
        )
        return FlowResponse(
            error = None,
        )
    async def ensure_existing_flow_queues(self):
        """Ensure queues exist for all already-running flows.

        Called on startup to handle flows that were started before this
        version of the flow service was deployed, or before a restart.
        Each flow is handled independently: failures are logged and
        swallowed so one broken flow cannot block startup.
        """
        flow_ids = await self.config.keys("flow")
        for flow_id in flow_ids:
            try:
                flow_data = await self.config.get("flow", flow_id)
                if flow_data is None:
                    continue
                flow = json.loads(flow_data)
                blueprint_name = flow.get("blueprint-name")
                if blueprint_name is None:
                    continue
                # Skip flows that are mid-shutdown
                if flow.get("status") == "stopping":
                    continue
                parameters = flow.get("parameters", {})
                blueprint_data = await self.config.get(
                    "flow-blueprint", blueprint_name
                )
                if blueprint_data is None:
                    logger.warning(
                        f"Blueprint '{blueprint_name}' not found for "
                        f"flow '{flow_id}', skipping queue creation"
                    )
                    continue
                cls = json.loads(blueprint_data)
                # Same substitution rules as flow start: {blueprint},
                # {id}, then each stored parameter.
                def repl_template(tmp):
                    result = tmp.replace(
                        "{blueprint}", blueprint_name
                    ).replace(
                        "{id}", flow_id
                    )
                    for param_name, param_value in parameters.items():
                        result = result.replace(
                            f"{{{param_name}}}", str(param_value)
                        )
                    return result
                queues = self._collect_flow_queues(cls, repl_template)
                for topic, subscription in queues:
                    # ensure_queue (rather than create_queue) because the
                    # queue may already exist from the original flow start.
                    await self.pubsub.ensure_queue(topic, subscription)
                logger.info(
                    f"Ensured queues for existing flow '{flow_id}'"
                )
            except Exception as e:
                logger.error(
                    f"Failed to ensure queues for flow '{flow_id}': {e}"
                )
def _collect_flow_queues(self, cls, repl_template):
"""Collect (topic, subscription) pairs for all flow-level queues.
Iterates the blueprint's "flow" section and reads only the
"topics" dict from each processor entry.
"""
queues = []
for k, v in cls["flow"].items():
processor, variant = k.split(":", 1)
variant = repl_template(variant)
for spec_name, topic_template in v.get("topics", {}).items():
topic = repl_template(topic_template)
subscription = f"{processor}--{variant}--{spec_name}"
queues.append((topic, subscription))
return queues
async def _delete_queues(self, queues):
"""Delete queues with retries. Best-effort — logs failures but
does not raise."""
for attempt in range(DELETE_RETRIES):
remaining = []
for topic, subscription in queues:
try:
await self.pubsub.delete_queue(topic, subscription)
except Exception as e:
logger.warning(
f"Queue delete failed (attempt {attempt + 1}/"
f"{DELETE_RETRIES}): {topic}: {e}"
)
remaining.append((topic, subscription))
if not remaining:
return
queues = remaining
if attempt < DELETE_RETRIES - 1:
await asyncio.sleep(DELETE_RETRY_DELAY)
for topic, subscription in queues:
logger.error(
f"Failed to delete queue after {DELETE_RETRIES} "
f"attempts: {topic}"
)
    async def handle_stop_flow(self, msg):
        """Stop a running flow in two phases.

        Phase 1 marks the flow "stopping" and batch-deletes its
        processor config entries — that config push tells processors to
        shut down their consumers. Phase 2 deletes the flow's queues
        (with retries) and finally removes the flow record.

        Raises:
            RuntimeError: if the flow ID is missing/unknown, or the flow
                record has no blueprint name.
        """
        if msg.flow_id is None:
            raise RuntimeError("No flow ID")
        if msg.flow_id not in await self.config.keys("flow"):
            raise RuntimeError("Flow ID invalid")
        flow = json.loads(await self.config.get("flow", msg.flow_id))
        if "blueprint-name" not in flow:
            raise RuntimeError("Internal error: flow has no flow blueprint")
        blueprint_name = flow["blueprint-name"]
        parameters = flow.get("parameters", {})
        cls = json.loads(
            await self.config.get("flow-blueprint", blueprint_name)
        )
        # Same substitution rules as flow start: {blueprint}, {id}, then
        # each stored parameter.
        def repl_template(tmp):
            result = tmp.replace(
                "{blueprint}", blueprint_name
            ).replace(
                "{id}", msg.flow_id
            )
            # Apply parameter substitutions
            for param_name, param_value in parameters.items():
                result = result.replace(f"{{{param_name}}}", str(param_value))
            return result
        # Collect queue identifiers before removing config
        queues = self._collect_flow_queues(cls, repl_template)
        # Phase 1: Set status to "stopping" and remove processor config.
        # The config push tells processors to shut down their consumers.
        flow["status"] = "stopping"
        await self.config.put(
            "flow", msg.flow_id, json.dumps(flow)
        )
        # Delete all processor config entries for this flow. Only the
        # "flow" section is deleted; "blueprint"-level entries written by
        # handle_start_flow are presumably shared with other flows of the
        # same blueprint — confirm before ever deleting them here.
        deletes = []
        for k, v in cls["flow"].items():
            processor, variant = k.split(":", 1)
            variant = repl_template(variant)
            deletes.append((f"processor:{processor}", variant))
        await self.config.delete_many(deletes)
        # Phase 2: Delete queues with retries, then remove the flow record.
        await self._delete_queues(queues)
        if msg.flow_id in await self.config.keys("flow"):
            await self.config.delete("flow", msg.flow_id)
        return FlowResponse(
            error = None,
        )
async def handle(self, msg):
logger.debug(f"Handling flow message: {msg.operation}")
if msg.operation == "list-blueprints":
resp = await self.handle_list_blueprints(msg)
elif msg.operation == "get-blueprint":
resp = await self.handle_get_blueprint(msg)
elif msg.operation == "put-blueprint":
resp = await self.handle_put_blueprint(msg)
elif msg.operation == "delete-blueprint":
resp = await self.handle_delete_blueprint(msg)
elif msg.operation == "list-flows":
resp = await self.handle_list_flows(msg)
elif msg.operation == "get-flow":
resp = await self.handle_get_flow(msg)
elif msg.operation == "start-flow":
resp = await self.handle_start_flow(msg)
elif msg.operation == "stop-flow":
resp = await self.handle_stop_flow(msg)
else:
resp = FlowResponse(
error=Error(
type = "bad-operation",
message = "Bad operation"
)
)
return resp

View file

@ -0,0 +1,162 @@
"""
Flow service. Manages flow lifecycle starting and stopping flows
by coordinating with the config service via pub/sub.
"""
import logging
from trustgraph.schema import Error
from trustgraph.schema import FlowRequest, FlowResponse
from trustgraph.schema import flow_request_queue, flow_response_queue
from trustgraph.schema import ConfigRequest, ConfigResponse
from trustgraph.schema import config_request_queue, config_response_queue
from trustgraph.base import AsyncProcessor, Consumer, Producer
from trustgraph.base import ConsumerMetrics, ProducerMetrics, SubscriberMetrics
from trustgraph.base import ConfigClient
from . flow import FlowConfig
# Module logger
logger = logging.getLogger(__name__)
# Default service identity passed to Processor.launch() in run().
default_ident = "flow-svc"
# Default queue names come from the shared schema module so all services
# agree on them; overridable via the --flow-*-queue arguments.
default_flow_request_queue = flow_request_queue
default_flow_response_queue = flow_response_queue
class Processor(AsyncProcessor):
    def __init__(self, **params):
        """Initialise the flow service processor.

        Wires up a consumer for flow requests, a producer for flow
        responses, and a request/response ConfigClient for talking to
        the config service; the FlowConfig handler does the actual work.
        """
        flow_request_queue = params.get(
            "flow_request_queue", default_flow_request_queue
        )
        flow_response_queue = params.get(
            "flow_response_queue", default_flow_response_queue
        )
        # NOTE(review): shadows builtin id; may be None when no "id"
        # param is supplied — used below as the consumer subscriber name
        # and in the config subscription. Confirm a None subscriber is
        # acceptable to the backend.
        id = params.get("id")
        super(Processor, self).__init__(
            **params | {
                "flow_request_schema": FlowRequest.__name__,
                "flow_response_schema": FlowResponse.__name__,
            }
        )
        flow_request_metrics = ConsumerMetrics(
            processor = self.id, flow = None, name = "flow-request"
        )
        flow_response_metrics = ProducerMetrics(
            processor = self.id, flow = None, name = "flow-response"
        )
        # Remembered so start() can ensure the queue exists before the
        # consumer connects (consumers never create queues).
        self.flow_request_topic = flow_request_queue
        self.flow_request_subscriber = id
        self.flow_request_consumer = Consumer(
            taskgroup = self.taskgroup,
            backend = self.pubsub,
            flow = None,
            topic = flow_request_queue,
            subscriber = id,
            schema = FlowRequest,
            handler = self.on_flow_request,
            metrics = flow_request_metrics,
        )
        self.flow_response_producer = Producer(
            backend = self.pubsub,
            topic = flow_response_queue,
            schema = FlowResponse,
            metrics = flow_response_metrics,
        )
        config_req_metrics = ProducerMetrics(
            processor=self.id, flow=None, name="config-request",
        )
        config_resp_metrics = SubscriberMetrics(
            processor=self.id, flow=None, name="config-response",
        )
        # Async request/response client for the config service.
        # NOTE(review): the subscription interpolates both self.id and
        # the raw "id" param — confirm this duplication is intentional.
        self.config_client = ConfigClient(
            backend=self.pubsub,
            subscription=f"{self.id}--config--{id}",
            consumer_name=self.id,
            request_topic=config_request_queue,
            request_schema=ConfigRequest,
            request_metrics=config_req_metrics,
            response_topic=config_response_queue,
            response_schema=ConfigResponse,
            response_metrics=config_resp_metrics,
        )
        self.flow = FlowConfig(self.config_client, self.pubsub)
        logger.info("Flow service initialized")
    async def start(self):
        """Start the service.

        Order matters: the request queue must exist before the consumer
        connects, and the config client must be running before
        existing-flow queues can be checked.
        """
        # This service owns its request queue; consumers never create
        # queues themselves.
        await self.pubsub.ensure_queue(
            self.flow_request_topic, self.flow_request_subscriber
        )
        await self.config_client.start()
        # Re-ensure queues for flows started before a restart/redeploy.
        await self.flow.ensure_existing_flow_queues()
        await self.flow_request_consumer.start()
async def on_flow_request(self, msg, consumer, flow):
try:
v = msg.value()
# Sender-produced ID
id = msg.properties()["id"]
logger.debug(f"Handling flow request {id}...")
resp = await self.flow.handle(v)
await self.flow_response_producer.send(
resp, properties={"id": id}
)
except Exception as e:
logger.error(f"Flow request failed: {e}")
resp = FlowResponse(
error=Error(
type = "flow-error",
message = str(e),
),
)
await self.flow_response_producer.send(
resp, properties={"id": id}
)
@staticmethod
def add_args(parser):
AsyncProcessor.add_args(parser)
parser.add_argument(
'--flow-request-queue',
default=default_flow_request_queue,
help=f'Flow request queue (default: {default_flow_request_queue})'
)
parser.add_argument(
'--flow-response-queue',
default=default_flow_response_queue,
help=f'Flow response queue {default_flow_response_queue}',
)
def run():
    # Entry point (used by __main__): launch the processor under the
    # default service identity, with the module docstring as description.
    Processor.launch(default_ident, __doc__)