Merge branch 'release/v2.4'

This commit is contained in:
Cyber MacGeddon 2026-04-29 17:56:48 +01:00
commit f3434307c5
91 changed files with 10657 additions and 1218 deletions

View file

@ -60,8 +60,10 @@ agent-orchestrator = "trustgraph.agent.orchestrator:run"
api-gateway = "trustgraph.gateway:run"
chunker-recursive = "trustgraph.chunking.recursive:run"
chunker-token = "trustgraph.chunking.token:run"
bootstrap = "trustgraph.bootstrap.bootstrapper:run"
config-svc = "trustgraph.config.service:run"
flow-svc = "trustgraph.flow.service:run"
iam-svc = "trustgraph.iam.service:run"
doc-embeddings-query-milvus = "trustgraph.query.doc_embeddings.milvus:run"
doc-embeddings-query-pinecone = "trustgraph.query.doc_embeddings.pinecone:run"
doc-embeddings-query-qdrant = "trustgraph.query.doc_embeddings.qdrant:run"

View file

@ -0,0 +1,68 @@
"""
Bootstrap framework: Initialiser base class and per-wake context.
See docs/tech-specs/bootstrap.md for the full design.
"""
import logging
from dataclasses import dataclass
from typing import Any
@dataclass
class InitContext:
    """Shared per-wake context passed to each initialiser.

    The bootstrapper constructs one of these on every wake cycle,
    tears it down at cycle end, and passes it into each initialiser's
    ``run()`` method.  Fields are short-lived and safe to use during
    a single cycle only.
    """
    # Per-initialiser child logger supplied by the bootstrapper.
    logger: logging.Logger
    config: Any  # ConfigClient
    flow: Any    # RequestResponse client for flow-svc
class Initialiser:
    """Base class for bootstrap initialisers.

    Subclasses implement :meth:`run`.  The bootstrapper manages
    completion state, flag comparison, retry and error handling —
    subclasses describe only the work to perform.

    Class attributes:

    * ``wait_for_services`` (bool, default ``True``): when ``True`` the
      initialiser only runs after the bootstrapper's service gate has
      passed (config-svc and flow-svc reachable).  Set ``False`` for
      initialisers that bring up infrastructure the gate itself
      depends on — principally Pulsar topology, without which
      config-svc cannot come online.
    """

    wait_for_services: bool = True

    def __init__(self, **params):
        # Subclasses should consume their own params via keyword
        # arguments in their own __init__ signatures. This catch-all
        # is here so any kwargs that filter through unnoticed don't
        # raise TypeError on construction.
        pass

    async def run(self, ctx, old_flag, new_flag):
        """Perform initialisation work.

        :param ctx: :class:`InitContext` with logger, config client,
            flow-svc client.
        :param old_flag: Previously-stored flag string, or ``None`` if
            this initialiser has never successfully completed in this
            deployment.
        :param new_flag: Currently-configured flag.  A string chosen
            by the operator; typically something like ``"v1"``.
        :raises: Any exception on failure.  The bootstrapper catches,
            logs, and re-runs on the next cycle; completion state is
            only written on clean return.
        """
        raise NotImplementedError

View file

@ -0,0 +1 @@
from . service import *

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
"""Module entry point: delegates to the package's ``run()`` launcher."""
from . service import run

if __name__ == '__main__':
    run()

View file

@ -0,0 +1,414 @@
"""
Bootstrapper processor.
Runs a pluggable list of initialisers in a reconciliation loop.
Each initialiser's completion state is recorded in the reserved
``__system__`` workspace under the ``init-state`` config type.
See docs/tech-specs/bootstrap.md for the full design.
"""
import asyncio
import importlib
import json
import logging
import uuid
from argparse import ArgumentParser
from dataclasses import dataclass
from trustgraph.base import AsyncProcessor
from trustgraph.base import ProducerMetrics, SubscriberMetrics
from trustgraph.base.config_client import ConfigClient
from trustgraph.base.request_response_spec import RequestResponse
from trustgraph.schema import (
ConfigRequest, ConfigResponse,
config_request_queue, config_response_queue,
)
from trustgraph.schema import (
FlowRequest, FlowResponse,
flow_request_queue, flow_response_queue,
)
from .. base import Initialiser, InitContext
logger = logging.getLogger(__name__)

default_ident = "bootstrap"

# Reserved workspace + config type under which completion state is
# stored.  Reserved (`_`-prefix) workspaces are excluded from the
# config push broadcast — live processors never see these keys.
SYSTEM_WORKSPACE = "__system__"
INIT_STATE_TYPE = "init-state"

# Cadence tiers (seconds between wake cycles).
GATE_BACKOFF = 5        # Services not responding; retry soon.
INIT_RETRY = 15         # Gate passed but something ran/failed;
                        # converge quickly.
STEADY_INTERVAL = 300   # Everything at target flag; idle cheaply.
@dataclass
class InitialiserSpec:
    """One entry in the bootstrapper's configured list of initialisers."""
    # Unique name; also the key under which completion state is stored.
    name: str
    # Target flag; the spec re-runs whenever the stored flag differs.
    flag: str
    # Instantiated Initialiser, constructed from the configured params.
    instance: Initialiser
def _resolve_class(dotted):
    """Import and return a class given its fully-qualified dotted path.

    ``"pkg.mod.Cls"`` imports ``pkg.mod`` and returns its ``Cls``
    attribute.  A bare name with no module component is rejected.
    """
    module_path, _, attr = dotted.rpartition(".")
    if not module_path:
        raise ValueError(
            f"Initialiser class must be a dotted path, got {dotted!r}"
        )
    return getattr(importlib.import_module(module_path), attr)
def _load_initialisers_file(path):
    """Load the initialisers spec list from a YAML or JSON file.

    File shape:

    .. code-block:: yaml

        initialisers:
          - class: trustgraph.bootstrap.initialisers.PulsarTopology
            name: pulsar-topology
            flag: v1
            params:
              admin_url: http://pulsar:8080
              tenant: tg
          - ...

    :param path: Path to the file.  ``.yaml`` / ``.yml`` extensions are
        parsed as YAML; anything else is parsed as JSON.
    :returns: The list stored under the top-level ``initialisers`` key.
    :raises RuntimeError: when the document is not a mapping with an
        ``initialisers`` key, or that key is not a list.
    """
    # Explicit encoding: config files should not depend on the host's
    # locale default.
    with open(path, encoding="utf-8") as f:
        content = f.read()
    if path.endswith((".yaml", ".yml")):
        import yaml
        doc = yaml.safe_load(content)
    else:
        doc = json.loads(content)
    if not isinstance(doc, dict) or "initialisers" not in doc:
        raise RuntimeError(
            f"{path}: expected a mapping with an 'initialisers' key"
        )
    inits = doc["initialisers"]
    # Guard against a scalar or mapping here; individual entries are
    # validated further by the caller (Processor.__init__).
    if not isinstance(inits, list):
        raise RuntimeError(
            f"{path}: 'initialisers' must be a list, "
            f"got {type(inits).__name__}"
        )
    return inits
class Processor(AsyncProcessor):
    """Bootstrapper: runs the configured initialisers in a
    reconciliation loop, persisting per-initialiser completion flags
    under the reserved ``__system__`` workspace / ``init-state`` type.
    """

    def __init__(self, **params):
        super().__init__(**params)

        # Source the initialisers list either from a direct parameter
        # (processor-group embedding) or from a file (CLI launch).
        inits = params.get("initialisers")
        if inits is None:
            inits_file = params.get("initialisers_file")
            if inits_file is None:
                raise RuntimeError(
                    "Bootstrapper requires either the 'initialisers' "
                    "parameter or --initialisers-file"
                )
            inits = _load_initialisers_file(inits_file)

        # Validate and instantiate every spec up front so config
        # errors fail fast at startup rather than mid-loop.
        self.specs = []
        names = set()
        for entry in inits:
            if not isinstance(entry, dict):
                raise RuntimeError(
                    f"Initialiser entry must be a mapping, got: {entry!r}"
                )
            for required in ("class", "name", "flag"):
                if required not in entry:
                    raise RuntimeError(
                        f"Initialiser entry missing required field "
                        f"{required!r}: {entry!r}"
                    )
            name = entry["name"]
            if name in names:
                raise RuntimeError(f"Duplicate initialiser name {name!r}")
            names.add(name)
            cls = _resolve_class(entry["class"])
            try:
                instance = cls(**entry.get("params", {}))
            except Exception as e:
                # NOTE(review): consider `raise ... from e` to make the
                # cause chain explicit.
                raise RuntimeError(
                    f"Failed to instantiate initialiser "
                    f"{entry['class']!r} as {name!r}: "
                    f"{type(e).__name__}: {e}"
                )
            self.specs.append(InitialiserSpec(
                name=name,
                flag=entry["flag"],
                instance=instance,
            ))

        logger.info(
            f"Bootstrapper: loaded {len(self.specs)} initialisers"
        )

    # ------------------------------------------------------------------
    # Client construction (short-lived per wake cycle).
    # ------------------------------------------------------------------

    def _make_config_client(self):
        """Build a fresh config-svc request/response client.

        A new uuid subscription suffix per call avoids collisions with
        subscriptions left behind by previous wake cycles.
        """
        rr_id = str(uuid.uuid4())
        return ConfigClient(
            backend=self.pubsub_backend,
            subscription=f"{self.id}--config--{rr_id}",
            consumer_name=self.id,
            request_topic=config_request_queue,
            request_schema=ConfigRequest,
            request_metrics=ProducerMetrics(
                processor=self.id, flow=None, name="config-request",
            ),
            response_topic=config_response_queue,
            response_schema=ConfigResponse,
            response_metrics=SubscriberMetrics(
                processor=self.id, flow=None, name="config-response",
            ),
        )

    def _make_flow_client(self):
        """Build a fresh flow-svc request/response client."""
        rr_id = str(uuid.uuid4())
        return RequestResponse(
            backend=self.pubsub_backend,
            subscription=f"{self.id}--flow--{rr_id}",
            consumer_name=self.id,
            request_topic=flow_request_queue,
            request_schema=FlowRequest,
            request_metrics=ProducerMetrics(
                processor=self.id, flow=None, name="flow-request",
            ),
            response_topic=flow_response_queue,
            response_schema=FlowResponse,
            response_metrics=SubscriberMetrics(
                processor=self.id, flow=None, name="flow-response",
            ),
        )

    async def _open_clients(self):
        """Start both clients; on partial failure, stop the one that
        already started before re-raising so nothing leaks."""
        config = self._make_config_client()
        flow = self._make_flow_client()
        await config.start()
        try:
            await flow.start()
        except Exception:
            await self._safe_stop(config)
            raise
        return config, flow

    async def _safe_stop(self, client):
        """Best-effort stop: shutdown errors are deliberately swallowed."""
        try:
            await client.stop()
        except Exception:
            pass

    # ------------------------------------------------------------------
    # Service gate.
    # ------------------------------------------------------------------

    async def _gate_ready(self, config, flow):
        """Return True when both config-svc and flow-svc answer."""
        try:
            await config.keys(SYSTEM_WORKSPACE, INIT_STATE_TYPE)
        except Exception as e:
            logger.info(
                f"Gate: config-svc not ready ({type(e).__name__}: {e})"
            )
            return False
        try:
            resp = await flow.request(
                FlowRequest(
                    operation="list-blueprints",
                    workspace=SYSTEM_WORKSPACE,
                ),
                timeout=5,
            )
            if resp.error:
                logger.info(
                    f"Gate: flow-svc error: "
                    f"{resp.error.type}: {resp.error.message}"
                )
                return False
        except Exception as e:
            logger.info(
                f"Gate: flow-svc not ready ({type(e).__name__}: {e})"
            )
            return False
        return True

    # ------------------------------------------------------------------
    # Completion state.
    # ------------------------------------------------------------------

    async def _stored_flag(self, config, name):
        """Read the stored completion flag for *name*, or None if the
        initialiser has never completed in this deployment."""
        raw = await config.get(SYSTEM_WORKSPACE, INIT_STATE_TYPE, name)
        if raw is None:
            return None
        try:
            return json.loads(raw)
        except Exception:
            # Hand-written / legacy values may not be JSON; use verbatim.
            return raw

    async def _store_flag(self, config, name, flag):
        """Persist the completion flag for *name* (JSON-encoded)."""
        await config.put(
            SYSTEM_WORKSPACE, INIT_STATE_TYPE, name,
            json.dumps(flag),
        )

    # ------------------------------------------------------------------
    # Per-spec execution.
    # ------------------------------------------------------------------

    async def _run_spec(self, spec, config, flow):
        """Run a single initialiser spec.

        Returns one of:

        - ``"skip"``: stored flag already matches target, nothing to do.
        - ``"ran"``: initialiser ran and completion state was updated.
        - ``"failed"``: initialiser raised.
        - ``"failed-state-write"``: initialiser succeeded but we could
          not persist the new flag (transient — will re-run next cycle).
        """
        try:
            old_flag = await self._stored_flag(config, spec.name)
        except Exception as e:
            logger.warning(
                f"{spec.name}: could not read stored flag "
                f"({type(e).__name__}: {e})"
            )
            return "failed"
        if old_flag == spec.flag:
            return "skip"
        child_logger = logger.getChild(spec.name)
        child_ctx = InitContext(
            logger=child_logger,
            config=config,
            flow=flow,
        )
        child_logger.info(
            f"Running (old_flag={old_flag!r} -> new_flag={spec.flag!r})"
        )
        try:
            await spec.instance.run(child_ctx, old_flag, spec.flag)
        except Exception as e:
            child_logger.error(
                f"Failed: {type(e).__name__}: {e}", exc_info=True,
            )
            return "failed"
        try:
            await self._store_flag(config, spec.name, spec.flag)
        except Exception as e:
            child_logger.warning(
                f"Completed but could not persist state flag "
                f"({type(e).__name__}: {e}); will re-run next cycle"
            )
            return "failed-state-write"
        child_logger.info(f"Completed (flag={spec.flag!r})")
        return "ran"

    # ------------------------------------------------------------------
    # Main loop.
    # ------------------------------------------------------------------

    async def run(self):
        """Reconciliation loop.

        Each wake: open short-lived clients, run pre-gate initialisers,
        check the service gate, run post-gate initialisers if the gate
        passed, pick a sleep cadence, tear the clients down, sleep.
        """
        logger.info(
            f"Bootstrapper starting with {len(self.specs)} initialisers"
        )
        while self.running:
            sleep_for = STEADY_INTERVAL
            try:
                config, flow = await self._open_clients()
            except Exception as e:
                logger.info(
                    f"Failed to open clients "
                    f"({type(e).__name__}: {e}); retry in {GATE_BACKOFF}s"
                )
                await asyncio.sleep(GATE_BACKOFF)
                continue
            try:
                # Phase 1: pre-service initialisers run unconditionally.
                pre_specs = [
                    s for s in self.specs
                    if not s.instance.wait_for_services
                ]
                pre_results = {}
                for spec in pre_specs:
                    pre_results[spec.name] = await self._run_spec(
                        spec, config, flow,
                    )
                # Phase 2: gate.
                gate_ok = await self._gate_ready(config, flow)
                # Phase 3: post-service initialisers, if gate passed.
                post_results = {}
                if gate_ok:
                    post_specs = [
                        s for s in self.specs
                        if s.instance.wait_for_services
                    ]
                    for spec in post_specs:
                        post_results[spec.name] = await self._run_spec(
                            spec, config, flow,
                        )
                # Cadence selection.
                if not gate_ok:
                    sleep_for = GATE_BACKOFF
                else:
                    all_results = {**pre_results, **post_results}
                    if any(r != "skip" for r in all_results.values()):
                        sleep_for = INIT_RETRY
                    else:
                        sleep_for = STEADY_INTERVAL
            finally:
                # Clients are short-lived: always torn down per cycle.
                await self._safe_stop(config)
                await self._safe_stop(flow)
            await asyncio.sleep(sleep_for)

    # ------------------------------------------------------------------
    # CLI arg plumbing.
    # ------------------------------------------------------------------

    @staticmethod
    def add_args(parser: ArgumentParser) -> None:
        """Register CLI arguments (extends AsyncProcessor's set)."""
        AsyncProcessor.add_args(parser)
        parser.add_argument(
            '-c', '--initialisers-file',
            help='Path to YAML or JSON file describing the '
                 'initialisers to run. Ignored when the '
                 "'initialisers' parameter is provided directly "
                 '(e.g. when running inside a processor group).',
        )
def run():
    """Console-script entry point: launch the bootstrapper processor."""
    Processor.launch(default_ident, __doc__)

View file

@ -0,0 +1,20 @@
"""
Core bootstrap initialisers.
These cover the base TrustGraph deployment case. Enterprise or
third-party initialisers live in their own packages and are
referenced in the bootstrapper's config by fully-qualified dotted
path.
"""
from . pulsar_topology import PulsarTopology
from . template_seed import TemplateSeed
from . workspace_init import WorkspaceInit
from . default_flow_start import DefaultFlowStart
__all__ = [
"PulsarTopology",
"TemplateSeed",
"WorkspaceInit",
"DefaultFlowStart",
]

View file

@ -0,0 +1,101 @@
"""
DefaultFlowStart initialiser starts a named flow in a workspace
using a specified blueprint.
Separated from WorkspaceInit so deployments that want a workspace
without an auto-started flow can simply omit this initialiser.
Parameters
----------
workspace : str (default "default")
Workspace in which to start the flow.
flow_id : str (default "default")
Identifier for the started flow.
blueprint : str (required)
Blueprint name (must already exist in the workspace's config,
typically via TemplateSeed -> WorkspaceInit).
description : str (default "Default")
Human-readable description passed to flow-svc.
parameters : dict (optional)
Optional parameter overrides passed to start-flow.
"""
from trustgraph.schema import FlowRequest
from .. base import Initialiser
class DefaultFlowStart(Initialiser):
    """Start a named flow in a workspace using a specified blueprint.

    Separated from WorkspaceInit so deployments that want a workspace
    without an auto-started flow can simply omit this initialiser.
    """

    def __init__(
        self,
        workspace="default",
        flow_id="default",
        blueprint=None,
        description="Default",
        parameters=None,
        **kwargs,
    ):
        """:param workspace: workspace in which to start the flow.
        :param flow_id: identifier for the started flow.
        :param blueprint: blueprint name (required; must already exist
            in the workspace's config).
        :param description: human-readable description for flow-svc.
        :param parameters: optional parameter overrides for start-flow.
        :raises ValueError: when no blueprint is given.
        """
        super().__init__(**kwargs)
        if not blueprint:
            raise ValueError(
                "DefaultFlowStart requires 'blueprint'"
            )
        self.workspace = workspace
        self.flow_id = flow_id
        self.blueprint = blueprint
        self.description = description
        # Copy so later caller-side mutation can't leak into us.
        self.parameters = dict(parameters) if parameters else {}

    async def run(self, ctx, old_flag, new_flag):
        """Start the flow unless it is already running."""
        # Check whether the flow already exists. Belt-and-braces
        # beyond the flag gate: if an operator stops and restarts the
        # bootstrapper after the flow is already running, we don't
        # want to blindly try to start it again.
        list_resp = await ctx.flow.request(
            FlowRequest(
                operation="list-flows",
                workspace=self.workspace,
            ),
            timeout=10,
        )
        if list_resp.error:
            raise RuntimeError(
                f"list-flows failed: "
                f"{list_resp.error.type}: {list_resp.error.message}"
            )
        if self.flow_id in (list_resp.flow_ids or []):
            ctx.logger.info(
                f"Flow {self.flow_id!r} already running in workspace "
                f"{self.workspace!r}; nothing to do"
            )
            return
        ctx.logger.info(
            f"Starting flow {self.flow_id!r} "
            f"(blueprint={self.blueprint!r}) "
            f"in workspace {self.workspace!r}"
        )
        resp = await ctx.flow.request(
            FlowRequest(
                operation="start-flow",
                workspace=self.workspace,
                flow_id=self.flow_id,
                blueprint_name=self.blueprint,
                description=self.description,
                parameters=self.parameters,
            ),
            timeout=30,
        )
        if resp.error:
            raise RuntimeError(
                f"start-flow failed: "
                f"{resp.error.type}: {resp.error.message}"
            )
        ctx.logger.info(
            f"Flow {self.flow_id!r} started"
        )

View file

@ -0,0 +1,131 @@
"""
PulsarTopology initialiser creates Pulsar tenant and namespaces
with their retention policies.
Runs pre-gate (``wait_for_services = False``) because config-svc and
flow-svc can't connect to Pulsar until these namespaces exist.
Admin-API calls are idempotent so re-runs on flag change are safe.
"""
import asyncio
import requests
from .. base import Initialiser
# Namespace configs.  flow/request take broker defaults.  response
# and notify get aggressive retention — those classes carry short-lived
# request/response and notification traffic only.
NAMESPACE_CONFIG = {
    "flow": {},
    "request": {},
    "response": {
        "retention_policies": {
            "retentionSizeInMB": -1,
            "retentionTimeInMinutes": 3,
            "subscriptionExpirationTimeMinutes": 30,
        },
    },
    "notify": {
        "retention_policies": {
            "retentionSizeInMB": -1,
            "retentionTimeInMinutes": 3,
            "subscriptionExpirationTimeMinutes": 5,
        },
    },
}

# Timeout (seconds) applied to every Pulsar admin-API HTTP call.
REQUEST_TIMEOUT = 10
class PulsarTopology(Initialiser):
    """Create the Pulsar tenant and namespaces with retention policies.

    Runs pre-gate (``wait_for_services = False``) because config-svc
    and flow-svc can't connect to Pulsar until these namespaces exist.
    Admin-API calls are idempotent so re-runs on flag change are safe.
    """

    # Must run before the service gate: the gated services depend on
    # this topology existing.
    wait_for_services = False

    def __init__(
        self,
        admin_url="http://pulsar:8080",
        tenant="tg",
        **kwargs,
    ):
        """:param admin_url: Pulsar admin REST endpoint.
        :param tenant: tenant under which the namespaces are created.
        """
        super().__init__(**kwargs)
        # Normalise so the URL joins below never produce '//'.
        self.admin_url = admin_url.rstrip("/")
        self.tenant = tenant

    async def run(self, ctx, old_flag, new_flag):
        """Reconcile tenant + namespaces against the admin API."""
        # requests is blocking; offload to executor so the loop stays
        # responsive.  get_running_loop() is the supported call inside
        # a coroutine; get_event_loop() is deprecated in this context.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, self._reconcile_sync, ctx.logger)

    # ------------------------------------------------------------------
    # Sync admin-API calls.
    # ------------------------------------------------------------------

    def _get_clusters(self):
        """Return the broker's cluster list (needed for tenant create)."""
        resp = requests.get(
            f"{self.admin_url}/admin/v2/clusters",
            timeout=REQUEST_TIMEOUT,
        )
        resp.raise_for_status()
        return resp.json()

    def _tenant_exists(self):
        """True when the tenant already exists (GET returns 200)."""
        resp = requests.get(
            f"{self.admin_url}/admin/v2/tenants/{self.tenant}",
            timeout=REQUEST_TIMEOUT,
        )
        return resp.status_code == 200

    def _create_tenant(self, clusters):
        """Create the tenant; Pulsar answers 204 on success."""
        resp = requests.put(
            f"{self.admin_url}/admin/v2/tenants/{self.tenant}",
            json={"adminRoles": [], "allowedClusters": clusters},
            timeout=REQUEST_TIMEOUT,
        )
        if resp.status_code != 204:
            raise RuntimeError(
                f"Tenant {self.tenant!r} create failed: "
                f"{resp.status_code} {resp.text}"
            )

    def _namespace_exists(self, namespace):
        """True when tenant/namespace already exists."""
        resp = requests.get(
            f"{self.admin_url}/admin/v2/namespaces/"
            f"{self.tenant}/{namespace}",
            timeout=REQUEST_TIMEOUT,
        )
        return resp.status_code == 200

    def _create_namespace(self, namespace, config):
        """Create tenant/namespace with the given policy config."""
        resp = requests.put(
            f"{self.admin_url}/admin/v2/namespaces/"
            f"{self.tenant}/{namespace}",
            json=config,
            timeout=REQUEST_TIMEOUT,
        )
        if resp.status_code != 204:
            raise RuntimeError(
                f"Namespace {self.tenant}/{namespace} create failed: "
                f"{resp.status_code} {resp.text}"
            )

    def _reconcile_sync(self, logger):
        """Blocking reconcile pass: ensure tenant, then each namespace."""
        if not self._tenant_exists():
            clusters = self._get_clusters()
            logger.info(
                f"Creating tenant {self.tenant!r} with clusters {clusters}"
            )
            self._create_tenant(clusters)
        else:
            logger.debug(f"Tenant {self.tenant!r} already exists")
        for namespace, config in NAMESPACE_CONFIG.items():
            if self._namespace_exists(namespace):
                logger.debug(
                    f"Namespace {self.tenant}/{namespace} already exists"
                )
                continue
            logger.info(
                f"Creating namespace {self.tenant}/{namespace}"
            )
            self._create_namespace(namespace, config)

View file

@ -0,0 +1,93 @@
"""
TemplateSeed initialiser populates the reserved ``__template__``
workspace from an external JSON seed file.
Seed file shape:
.. code-block:: json
{
"flow-blueprint": {
"ontology": { ... },
"agent": { ... }
},
"prompt": {
...
},
...
}
Top-level keys are config types; nested keys are config entries.
Values are arbitrary JSON (they'll be ``json.dumps()``'d on write).
Parameters
----------
config_file : str
Path to the seed file on disk.
overwrite : bool (default False)
On re-run (flag change), if True overwrite all keys; if False
upsert-missing-only (preserves any operator customisation of
the template).
"""
import json
from .. base import Initialiser
# Reserved workspace holding factory-default seed config.
TEMPLATE_WORKSPACE = "__template__"

class TemplateSeed(Initialiser):
    """Populate the reserved ``__template__`` workspace from an
    external JSON seed file.

    Top-level keys in the seed are config types; nested keys are
    config entries; values are arbitrary JSON (``json.dumps()``'d
    on write).
    """

    def __init__(self, config_file, overwrite=False, **kwargs):
        """:param config_file: path to the seed file on disk.
        :param overwrite: on re-run (flag change), True overwrites all
            keys; False upserts missing keys only, preserving any
            operator customisation of the template.
        :raises ValueError: when no config_file is given.
        """
        super().__init__(**kwargs)
        if not config_file:
            raise ValueError("TemplateSeed requires 'config_file'")
        self.config_file = config_file
        self.overwrite = overwrite

    async def run(self, ctx, old_flag, new_flag):
        """Seed (first run) or re-seed (flag change) the template."""
        with open(self.config_file) as f:
            seed = json.load(f)
        if old_flag is None:
            # Clean first run — write every entry.
            await self._write_all(ctx, seed)
            return
        # Re-run after flag change.
        if self.overwrite:
            await self._write_all(ctx, seed)
        else:
            await self._upsert_missing(ctx, seed)

    async def _write_all(self, ctx, seed):
        # Unconditionally write every entry in the seed tree.
        values = []
        for type_name, entries in seed.items():
            for key, value in entries.items():
                values.append((type_name, key, json.dumps(value)))
        if values:
            await ctx.config.put_many(TEMPLATE_WORKSPACE, values)
        ctx.logger.info(
            f"Template seeded with {len(values)} entries"
        )

    async def _upsert_missing(self, ctx, seed):
        # Write only keys absent from the workspace, preserving any
        # operator edits to existing template entries.
        written = 0
        for type_name, entries in seed.items():
            existing = set(
                await ctx.config.keys(TEMPLATE_WORKSPACE, type_name)
            )
            values = []
            for key, value in entries.items():
                if key not in existing:
                    values.append(
                        (type_name, key, json.dumps(value))
                    )
            if values:
                await ctx.config.put_many(TEMPLATE_WORKSPACE, values)
                written += len(values)
        ctx.logger.info(
            f"Template upsert-missing: {written} new entries"
        )

View file

@ -0,0 +1,138 @@
"""
WorkspaceInit initialiser creates a workspace and populates it from
either the ``__template__`` workspace or a seed file on disk.
Parameters
----------
workspace : str
Target workspace to create / populate.
source : str
Either ``"template"`` (copy the full contents of the
``__template__`` workspace) or ``"seed-file"`` (read from
``seed_file``).
seed_file : str (required when source=="seed-file")
Path to a JSON seed file with the same shape TemplateSeed consumes.
overwrite : bool (default False)
On re-run (flag change), if True overwrite all keys; if False,
upsert-missing-only (preserves in-workspace customisations).
Raises (in ``run``)
-------------------
When source is ``"template"``, raises ``RuntimeError`` if the
``__template__`` workspace is empty — indicating that TemplateSeed
hasn't run yet. The bootstrapper's retry loop will re-attempt on
the next cycle once the prerequisite is satisfied.
"""
import json
from .. base import Initialiser
# Reserved workspace holding factory-default seed config.
TEMPLATE_WORKSPACE = "__template__"

class WorkspaceInit(Initialiser):
    """Create a workspace and populate it from either the
    ``__template__`` workspace or a seed file on disk.
    """

    def __init__(
        self,
        workspace="default",
        source="template",
        seed_file=None,
        overwrite=False,
        **kwargs,
    ):
        """:param workspace: target workspace to create / populate.
        :param source: ``"template"`` (copy the full contents of the
            ``__template__`` workspace) or ``"seed-file"`` (read from
            ``seed_file``).
        :param seed_file: path to a JSON seed file; required when
            ``source == "seed-file"``.
        :param overwrite: on re-run (flag change), True overwrites all
            keys; False upserts missing keys only, preserving
            in-workspace customisations.
        :raises ValueError: on an unknown source, or a missing
            seed_file when source is ``"seed-file"``.
        """
        super().__init__(**kwargs)
        if source not in ("template", "seed-file"):
            raise ValueError(
                f"WorkspaceInit: source must be 'template' or "
                f"'seed-file', got {source!r}"
            )
        if source == "seed-file" and not seed_file:
            raise ValueError(
                "WorkspaceInit: seed_file required when source='seed-file'"
            )
        self.workspace = workspace
        self.source = source
        self.seed_file = seed_file
        self.overwrite = overwrite

    async def run(self, ctx, old_flag, new_flag):
        """Load the seed tree, then write-all or upsert-missing.

        :raises RuntimeError: when source is ``"template"`` and the
            template workspace is empty (TemplateSeed hasn't run yet);
            the bootstrapper's retry loop re-attempts next cycle.
        """
        if self.source == "seed-file":
            tree = self._load_seed_file()
        else:
            tree = await self._load_from_template(ctx)
        if old_flag is None or self.overwrite:
            await self._write_all(ctx, tree)
        else:
            await self._upsert_missing(ctx, tree)

    def _load_seed_file(self):
        # Same shape as the seed file TemplateSeed consumes.
        with open(self.seed_file) as f:
            return json.load(f)

    async def _load_from_template(self, ctx):
        """Build a seed tree from the entire ``__template__`` workspace.

        Raises if the workspace is empty, so the bootstrapper knows
        the prerequisite isn't met yet."""
        raw_tree = await ctx.config.get_all(TEMPLATE_WORKSPACE)
        tree = {}
        total = 0
        for type_name, entries in raw_tree.items():
            parsed = {}
            for key, raw in entries.items():
                if raw is None:
                    continue
                try:
                    parsed[key] = json.loads(raw)
                except Exception:
                    # Non-JSON values are carried through verbatim.
                    parsed[key] = raw
                total += 1
            if parsed:
                tree[type_name] = parsed
        if total == 0:
            raise RuntimeError(
                "Template workspace is empty — has TemplateSeed run yet?"
            )
        ctx.logger.debug(
            f"Loaded {total} template entries across {len(tree)} types"
        )
        return tree

    async def _write_all(self, ctx, tree):
        # Unconditionally write every entry from the tree.
        values = []
        for type_name, entries in tree.items():
            for key, value in entries.items():
                values.append((type_name, key, json.dumps(value)))
        if values:
            await ctx.config.put_many(self.workspace, values)
        ctx.logger.info(
            f"Workspace {self.workspace!r} populated with "
            f"{len(values)} entries"
        )

    async def _upsert_missing(self, ctx, tree):
        # Write only keys absent from the target workspace.
        written = 0
        for type_name, entries in tree.items():
            existing = set(
                await ctx.config.keys(self.workspace, type_name)
            )
            values = []
            for key, value in entries.items():
                if key not in existing:
                    values.append(
                        (type_name, key, json.dumps(value))
                    )
            if values:
                await ctx.config.put_many(self.workspace, values)
                written += len(values)
        ctx.logger.info(
            f"Workspace {self.workspace!r} upsert-missing: "
            f"{written} new entries"
        )

View file

@ -24,6 +24,21 @@ logger = logging.getLogger(__name__)
default_ident = "config-svc"
def is_reserved_workspace(workspace):
    """Return True when *workspace* is a reserved, storage-only id.

    Any workspace id beginning with ``_`` is reserved for internal use
    (e.g. ``__template__`` holding factory-default seed config).
    Reads and writes work normally so bootstrap and provisioning code
    can use the standard config API, but change notifications for
    reserved workspaces are suppressed: services subscribed to the
    config push never see reserved-workspace events and so cannot
    accidentally act on template content as if it were live state.
    """
    return workspace[:1] == "_"
default_config_request_queue = config_request_queue
default_config_response_queue = config_response_queue
default_config_push_queue = config_push_queue
@ -130,6 +145,21 @@ class Processor(AsyncProcessor):
async def push(self, changes=None):
# Suppress notifications from reserved workspaces (ids starting
# with "_", e.g. "__template__"). Stored config is preserved;
# only the broadcast is filtered. Keeps services oblivious to
# template / bootstrap state.
if changes:
filtered = {}
for type_name, workspaces in changes.items():
visible = [
w for w in workspaces
if not is_reserved_workspace(w)
]
if visible:
filtered[type_name] = visible
changes = filtered
version = await self.config.get_version()
resp = ConfigPush(

View file

@ -5,7 +5,7 @@ Input is text, output is embeddings vector.
"""
from ... base import EmbeddingsService
from ollama import Client
from ollama import AsyncClient
import os
import logging
@ -30,24 +30,24 @@ class Processor(EmbeddingsService):
}
)
self.client = Client(host=ollama)
self.client = AsyncClient(host=ollama)
self.default_model = model
self._checked_models = set()
def _ensure_model(self, model_name):
async def _ensure_model(self, model_name):
"""Check if model exists locally, pull it if not."""
if model_name in self._checked_models:
return
try:
self.client.show(model_name)
await self.client.show(model_name)
self._checked_models.add(model_name)
except Exception as e:
status_code = getattr(e, 'status_code', None)
if status_code == 404 or "not found" in str(e).lower():
logger.info(f"Ollama model '{model_name}' not found locally. Pulling, this may take a while...")
try:
self.client.pull(model_name)
await self.client.pull(model_name)
self._checked_models.add(model_name)
logger.info(f"Successfully pulled Ollama model '{model_name}'.")
except Exception as pull_e:
@ -63,10 +63,10 @@ class Processor(EmbeddingsService):
use_model = model or self.default_model
# Ensure the model exists/is pulled
self._ensure_model(use_model)
await self._ensure_model(use_model)
# Ollama handles batch input efficiently
embeds = self.client.embed(
embeds = await self.client.embed(
model = use_model,
input = texts
)

View file

@ -1,22 +1,371 @@
"""
IAM-backed authentication and authorisation for the API gateway.
class Authenticator:
The gateway delegates both authentication ("who is this caller?")
and authorisation ("may they do this?") to the IAM regime via the
contract specified in docs/tech-specs/iam-contract.md. No regime-
specific policy (roles, scopes, claims) lives in the gateway.
def __init__(self, token=None, allow_all=False):
- Authentication: API keys are resolved by IAM; JWTs are validated
locally against the cached signing public key.
- Authorisation: every per-request decision is asked of IAM via
``authorise(identity, capability, resource, parameters)``, with
results cached for the TTL the regime returns.
"""
if not allow_all and token is None:
raise RuntimeError("Need a token")
import asyncio
import base64
import hashlib
import json
import logging
import time
import uuid
from dataclasses import dataclass, field
if not allow_all and token == "":
raise RuntimeError("Need a token")
from aiohttp import web
self.token = token
self.allow_all = allow_all
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import ed25519
def permitted(self, token, roles):
from ..base.iam_client import IamClient
from ..base.metrics import ProducerMetrics, SubscriberMetrics
from ..schema import (
IamRequest, IamResponse,
iam_request_queue, iam_response_queue,
)
if self.allow_all: return True
logger = logging.getLogger("auth")
if self.token != token: return False
API_KEY_CACHE_TTL = 60 # seconds
return True
# Upper bound on cache TTL the gateway honours for an authorisation
# decision, regardless of what the regime suggested. Caps the
# revocation latency window.
AUTHZ_CACHE_TTL_MAX = 60 # seconds
@dataclass
class Identity:
    """The gateway-side surface of an authenticated caller.

    Per the IAM contract this is a small fixed shape; regime-internal
    state (roles, claims, group memberships) is reachable only via
    the regime's ``authorise`` operation.  The gateway itself never
    reads policy from this object.
    """

    # Opaque handle, quoted back when calling ``authorise``.  For
    # the OSS regime this is the user record's id; the gateway
    # treats it as a string with no semantic content.
    handle: str

    # The workspace this credential authenticates to.  Used by the
    # gateway as the default-fill-in for operations that omit a
    # workspace.  Never used as policy input.
    workspace: str

    # Stable identifier for audit logs.  In OSS this is the same
    # value as ``handle``; not assumed equal in the contract.
    principal_id: str

    # How the credential was presented.  Non-policy; useful for
    # logs / metrics only.
    source: str  # "api-key" | "jwt"
def _auth_failure():
    """Uniform 401 returned for every authentication failure mode."""
    response = web.HTTPUnauthorized(
        content_type="application/json",
        text='{"error":"auth failure"}',
    )
    return response
def _access_denied():
    """Uniform 403 returned when authorisation is refused."""
    response = web.HTTPForbidden(
        content_type="application/json",
        text='{"error":"access denied"}',
    )
    return response
def _b64url_decode(s):
    """Decode base64url text *s*, restoring any stripped '=' padding."""
    padding = "=" * ((4 - len(s) % 4) % 4)
    return base64.urlsafe_b64decode(s + padding)
def _verify_jwt_eddsa(token, public_pem):
    """Verify an Ed25519 JWT and return its claims.  Raises on any
    validation failure.  Refuses non-EdDSA algorithms.

    :param token: compact-serialised JWT (``header.payload.signature``).
    :param public_pem: PEM-encoded Ed25519 public key text.
    :returns: the claims dict, trusted only after signature check.
    :raises ValueError: malformed token, wrong algorithm, wrong key
        type, or expired.
    :raises cryptography.exceptions.InvalidSignature: bad signature.
    """
    parts = token.split(".")
    if len(parts) != 3:
        raise ValueError("malformed JWT")
    h_b64, p_b64, s_b64 = parts
    # The signature covers the raw base64url header.payload text.
    signing_input = f"{h_b64}.{p_b64}".encode("ascii")
    header = json.loads(_b64url_decode(h_b64))
    # Pin the algorithm before any verification — rejects
    # algorithm-confusion attempts (e.g. alg "none").
    if header.get("alg") != "EdDSA":
        raise ValueError(f"unsupported alg: {header.get('alg')!r}")
    key = serialization.load_pem_public_key(public_pem.encode("ascii"))
    if not isinstance(key, ed25519.Ed25519PublicKey):
        raise ValueError("public key is not Ed25519")
    signature = _b64url_decode(s_b64)
    key.verify(signature, signing_input)  # raises InvalidSignature
    # Only decode/trust claims after the signature has verified.
    claims = json.loads(_b64url_decode(p_b64))
    exp = claims.get("exp")
    if exp is None or exp < time.time():
        raise ValueError("expired")
    return claims
class IamAuth:
"""Resolves bearer credentials via the IAM service.
Used by every gateway endpoint that needs authentication. Fetches
the IAM signing public key at startup (cached in memory). API
keys are resolved via the IAM service with a local hash-to-identity
cache (short TTL so revoked keys stop working within the TTL
window without any push mechanism)."""
    def __init__(self, backend, id="api-gateway"):
        """:param backend: pub/sub backend used to build IAM clients.
        :param id: processor id used in subscription names and metrics.
        """
        self.backend = backend
        self.id = id
        # Populated at start() via IAM.
        self._signing_public_pem = None
        # API-key cache: plaintext_sha256_hex -> (Identity, expires_ts)
        self._key_cache = {}
        self._key_cache_lock = asyncio.Lock()
        # Authorisation decision cache: hash(handle, capability,
        # resource, parameters) -> (allow_bool, expires_ts).  Holds
        # both allows and denies — denies cached briefly to avoid
        # hammering iam-svc with repeated rejected attempts.
        self._authz_cache: dict[str, tuple[bool, float]] = {}
        self._authz_cache_lock = asyncio.Lock()
    # ------------------------------------------------------------------
    # Short-lived client helper.  Mirrors the pattern used by the
    # bootstrap framework and AsyncProcessor: a fresh uuid suffix per
    # invocation so Pulsar exclusive subscriptions don't collide with
    # ghosts from prior calls.
    # ------------------------------------------------------------------

    def _make_client(self):
        """Build a fresh request/response client for iam-svc."""
        rr_id = str(uuid.uuid4())
        return IamClient(
            backend=self.backend,
            subscription=f"{self.id}--iam--{rr_id}",
            consumer_name=self.id,
            request_topic=iam_request_queue,
            request_schema=IamRequest,
            request_metrics=ProducerMetrics(
                processor=self.id, flow=None, name="iam-request",
            ),
            response_topic=iam_response_queue,
            response_schema=IamResponse,
            response_metrics=SubscriberMetrics(
                processor=self.id, flow=None, name="iam-response",
            ),
        )
async def _with_client(self, op):
"""Open a short-lived IamClient, run ``op(client)``, close."""
client = self._make_client()
await client.start()
try:
return await op(client)
finally:
try:
await client.stop()
except Exception:
pass
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
async def start(self, max_retries=30, retry_delay=2.0):
"""Fetch the signing public key from IAM. Retries on
failure the gateway may be starting before IAM is ready."""
async def _fetch(client):
return await client.get_signing_key_public()
for attempt in range(max_retries):
try:
pem = await self._with_client(_fetch)
if pem:
self._signing_public_pem = pem
logger.info(
"IamAuth: fetched IAM signing public key "
f"({len(pem)} bytes)"
)
return
except Exception as e:
logger.info(
f"IamAuth: waiting for IAM signing key "
f"({type(e).__name__}: {e}); "
f"retry {attempt + 1}/{max_retries}"
)
await asyncio.sleep(retry_delay)
# Don't prevent startup forever. A later authenticate() call
# will try again via the JWT path.
logger.warning(
"IamAuth: could not fetch IAM signing key at startup; "
"JWT validation will fail until it's available"
)
# ------------------------------------------------------------------
# Authentication
# ------------------------------------------------------------------
async def authenticate(self, request):
"""Extract and validate the Bearer credential from an HTTP
request. Returns an ``Identity``. Raises HTTPUnauthorized
(401 / "auth failure") on any failure mode the caller
cannot distinguish missing / malformed / invalid / expired /
revoked credentials."""
header = request.headers.get("Authorization", "")
if not header.startswith("Bearer "):
raise _auth_failure()
token = header[len("Bearer "):].strip()
if not token:
raise _auth_failure()
# API keys always start with "tg_". JWTs have two dots and
# no "tg_" prefix. Discriminate cheaply.
if token.startswith("tg_"):
return await self._resolve_api_key(token)
if token.count(".") == 2:
return self._verify_jwt(token)
raise _auth_failure()
def _verify_jwt(self, token):
if not self._signing_public_pem:
raise _auth_failure()
try:
claims = _verify_jwt_eddsa(token, self._signing_public_pem)
except Exception as e:
logger.debug(f"JWT validation failed: {type(e).__name__}: {e}")
raise _auth_failure()
sub = claims.get("sub", "")
ws = claims.get("workspace", "")
if not sub or not ws:
raise _auth_failure()
# JWT carries no policy state under the IAM contract;
# any roles / claims field is ignored here.
return Identity(
handle=sub, workspace=ws, principal_id=sub, source="jwt",
)
async def _resolve_api_key(self, plaintext):
h = hashlib.sha256(plaintext.encode("utf-8")).hexdigest()
cached = self._key_cache.get(h)
now = time.time()
if cached and cached[1] > now:
return cached[0]
async with self._key_cache_lock:
cached = self._key_cache.get(h)
if cached and cached[1] > now:
return cached[0]
try:
async def _call(client):
return await client.resolve_api_key(plaintext)
# ``roles`` is returned by the OSS regime as a hint
# but is not consulted by the gateway; all policy
# decisions go through ``authorise``.
user_id, workspace, _roles = await self._with_client(_call)
except Exception as e:
logger.debug(
f"API key resolution failed: "
f"{type(e).__name__}: {e}"
)
raise _auth_failure()
if not user_id or not workspace:
raise _auth_failure()
identity = Identity(
handle=user_id, workspace=workspace,
principal_id=user_id, source="api-key",
)
self._key_cache[h] = (identity, now + API_KEY_CACHE_TTL)
return identity
# ------------------------------------------------------------------
# Authorisation
# ------------------------------------------------------------------
@staticmethod
def _authz_cache_key(handle, capability, resource, parameters):
payload = json.dumps(
{
"h": handle,
"c": capability,
"r": resource or {},
"p": parameters or {},
},
sort_keys=True,
separators=(",", ":"),
)
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
async def authorise(self, identity, capability, resource, parameters):
"""Ask the IAM regime whether ``identity`` may perform
``capability`` on ``resource`` given ``parameters``.
Caches the decision for the regime's suggested TTL, clamped
above by ``AUTHZ_CACHE_TTL_MAX``. Both allow and deny
decisions are cached (denies briefly, to avoid hammering
iam-svc with repeated rejected attempts).
Raises ``HTTPForbidden`` (403 / "access denied") on a deny
decision. Raises ``HTTPUnauthorized`` (401 / "auth failure")
if the IAM service errors out failing closed."""
key = self._authz_cache_key(
identity.handle, capability, resource, parameters,
)
now = time.time()
cached = self._authz_cache.get(key)
if cached and cached[1] > now:
allow, _ = cached
if not allow:
raise _access_denied()
return
async with self._authz_cache_lock:
cached = self._authz_cache.get(key)
if cached and cached[1] > now:
allow, _ = cached
if not allow:
raise _access_denied()
return
try:
async def _call(client):
return await client.authorise(
identity.handle, capability,
resource or {}, parameters or {},
)
allow, ttl = await self._with_client(_call)
except Exception as e:
logger.warning(
f"authorise failed: {type(e).__name__}: {e}; "
f"failing closed for "
f"{identity.principal_id!r} cap={capability!r}"
)
raise _auth_failure()
ttl = max(0, min(int(ttl or 0), AUTHZ_CACHE_TTL_MAX))
self._authz_cache[key] = (bool(allow), now + ttl)
if not allow:
raise _access_denied()
return

View file

@ -0,0 +1,100 @@
"""
Gateway-side authorisation entry points.
Under the IAM contract (see docs/tech-specs/iam-contract.md) the
gateway holds *no* policy state. Roles, capability sets, and
workspace-scope rules all live in the IAM regime (iam-svc for OSS).
This module is the thin surface the gateway uses to ask the regime
for a decision:
- ``PUBLIC`` / ``AUTHENTICATED`` — sentinels for endpoints that don't
  go through capability-based authorisation.
- :func:`enforce` — authenticate the caller, then ask the regime.
- :func:`enforce_workspace` — default-fill the workspace from the
  caller's bound workspace and ask the regime, with the workspace
  treated as the resource address.
The capability strings themselves are an open vocabulary — see
docs/tech-specs/capabilities.md. The gateway does not validate them
beyond passing them through; an unknown capability simply produces a
deny verdict from the regime.
"""
from aiohttp import web
# Sentinel capabilities: endpoints marked with these bypass the
# regime's capability check (PUBLIC also bypasses authentication).
PUBLIC = "__public__"
AUTHENTICATED = "__authenticated__"


def access_denied():
    """Build the masked 403 response — identical for every deny."""
    return web.HTTPForbidden(
        content_type="application/json",
        text='{"error":"access denied"}',
    )


def auth_failure():
    """Build the masked 401 response — identical for every failure."""
    return web.HTTPUnauthorized(
        content_type="application/json",
        text='{"error":"auth failure"}',
    )


async def enforce(request, auth, capability):
    """Authenticate the caller and (for non-sentinel capabilities)
    ask the IAM regime whether they may invoke ``capability``.

    The resource is system-level (``{}``) and parameters are empty —
    use :func:`enforce_workspace` for workspace-scoped endpoints, or
    drive authorisation through the operation registry for richer
    cases.

    - ``PUBLIC``: returns ``None`` — no authentication.
    - ``AUTHENTICATED``: returns the ``Identity`` — no authorisation.
    - capability string: returns the ``Identity`` if the regime
      allows; raises ``HTTPForbidden`` otherwise.
    """
    if capability == PUBLIC:
        # Fully open endpoint: no identity is resolved at all.
        return None

    identity = await auth.authenticate(request)

    if capability != AUTHENTICATED:
        # Real capability string: ask the regime with a system-level
        # (empty) resource and no parameters.
        await auth.authorise(identity, capability, {}, {})
    return identity
async def enforce_workspace(data, identity, auth, capability=None):
    """Default-fill the workspace on a request body and (optionally)
    authorise the caller for ``capability`` against that workspace.

    - Target workspace = ``data["workspace"]`` if supplied, else the
      caller's bound workspace.
    - On success, ``data["workspace"]`` is overwritten with the
      resolved value so downstream code sees a single canonical
      address.
    - When ``capability`` is given, the regime is asked whether the
      caller may invoke ``capability`` on ``{workspace: target}``.
      Raises ``HTTPForbidden`` on a deny.

    For ``capability=None`` no authorisation call is made — the
    caller has presumably already authorised via :func:`enforce`
    (handy for endpoints that authorise once then resolve workspace
    on the body before forwarding).
    """
    if not isinstance(data, dict):
        # Non-dict bodies pass through untouched.
        return data

    # Empty / missing workspace falls back to the caller's binding.
    target = data.get("workspace", "") or identity.workspace
    data["workspace"] = target

    if capability is None:
        # Resolution only; authorisation already happened upstream.
        return data

    await auth.authorise(identity, capability, {"workspace": target}, {})
    return data

View file

@ -0,0 +1,40 @@
from ... schema import IamRequest, IamResponse
from ... schema import iam_request_queue, iam_response_queue
from ... messaging import TranslatorRegistry
from . requestor import ServiceRequestor
class IamRequestor(ServiceRequestor):
    """Gateway-side request/response client for the IAM service.

    Thin ``ServiceRequestor`` specialisation: wires up the IAM
    request/response queues and schemas, and translates between
    gateway JSON bodies and ``IamRequest`` / ``IamResponse`` messages
    via the shared translator registry."""

    def __init__(self, backend, consumer, subscriber, timeout=120,
                 request_queue=None, response_queue=None):
        # ``None`` queue arguments fall back to the schema-declared
        # IAM defaults; explicit overrides pass straight through.
        super().__init__(
            backend=backend,
            consumer_name=consumer,
            subscription=subscriber,
            request_queue=(
                iam_request_queue if request_queue is None
                else request_queue
            ),
            response_queue=(
                iam_response_queue if response_queue is None
                else response_queue
            ),
            request_schema=IamRequest,
            response_schema=IamResponse,
            timeout=timeout,
        )

        # Translators convert gateway dicts <-> schema objects for
        # the "iam" service kind.
        self.request_translator = (
            TranslatorRegistry.get_request_translator("iam")
        )
        self.response_translator = (
            TranslatorRegistry.get_response_translator("iam")
        )

    def to_request(self, body):
        """Decode a gateway JSON body into an ``IamRequest``."""
        return self.request_translator.decode(body)

    def from_response(self, message):
        """Encode an ``IamResponse`` along with completion status."""
        return self.response_translator.encode_with_completion(message)

View file

@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
from . config import ConfigRequestor
from . flow import FlowRequestor
from . iam import IamRequestor
from . librarian import LibrarianRequestor
from . knowledge import KnowledgeRequestor
from . collection_management import CollectionManagementRequestor
@ -72,6 +73,7 @@ request_response_dispatchers = {
global_dispatchers = {
"config": ConfigRequestor,
"flow": FlowRequestor,
"iam": IamRequestor,
"librarian": LibrarianRequestor,
"knowledge": KnowledgeRequestor,
"collection-management": CollectionManagementRequestor,
@ -105,13 +107,31 @@ class DispatcherWrapper:
class DispatcherManager:
def __init__(self, backend, config_receiver, prefix="api-gateway",
queue_overrides=None):
def __init__(self, backend, config_receiver, auth,
prefix="api-gateway", queue_overrides=None):
"""
``auth`` is required. It flows into the Mux for first-frame
WebSocket authentication and into downstream dispatcher
construction. There is no permissive default constructing
a DispatcherManager without an authenticator would be a
silent downgrade to no-auth on the socket path.
"""
if auth is None:
raise ValueError(
"DispatcherManager requires an 'auth' argument — there "
"is no no-auth mode"
)
self.backend = backend
self.config_receiver = config_receiver
self.config_receiver.add_handler(self)
self.prefix = prefix
# Gateway IamAuth — used by the socket Mux for first-frame
# auth and by any dispatcher that needs to resolve caller
# identity out-of-band.
self.auth = auth
# Store queue overrides for global services
# Format: {"config": {"request": "...", "response": "..."}, ...}
self.queue_overrides = queue_overrides or {}
@ -163,6 +183,15 @@ class DispatcherManager:
def dispatch_global_service(self):
return DispatcherWrapper(self.process_global_service)
    def dispatch_auth_iam(self):
        """Pre-configured IAM dispatcher for the gateway's auth
        endpoints (login, bootstrap, change-password). Pins the
        kind to ``iam`` so these handlers don't have to supply URL
        params the global dispatcher would expect."""

        async def _process(data, responder):
            # Fixed routing: always invoke the "iam" global service.
            return await self.invoke_global_service(data, responder, "iam")

        return DispatcherWrapper(_process)
def dispatch_core_export(self):
return DispatcherWrapper(self.process_core_export)
@ -314,7 +343,10 @@ class DispatcherManager:
async def process_socket(self, ws, running, params):
dispatcher = Mux(self, ws, running)
# The mux self-authenticates via the first-frame protocol;
# pass the gateway's IamAuth so it can validate tokens
# without reaching back into the endpoint layer.
dispatcher = Mux(self, ws, running, auth=self.auth)
return dispatcher

View file

@ -16,11 +16,28 @@ MAX_QUEUE_SIZE = 10
class Mux:
def __init__(self, dispatcher_manager, ws, running):
def __init__(self, dispatcher_manager, ws, running, auth):
"""
``auth`` is required the Mux implements the first-frame
auth protocol described in ``iam.md`` and will refuse any
non-auth frame until an ``auth-ok`` has been issued. There
is no no-auth mode.
"""
if auth is None:
raise ValueError(
"Mux requires an 'auth' argument — there is no "
"no-auth mode"
)
self.dispatcher_manager = dispatcher_manager
self.ws = ws
self.running = running
self.auth = auth
# Authenticated identity, populated by the first-frame auth
# protocol. ``None`` means the socket is not yet
# authenticated; any non-auth frame is refused.
self.identity = None
self.q = asyncio.Queue(maxsize=MAX_QUEUE_SIZE)
@ -31,6 +48,41 @@ class Mux:
if self.ws:
await self.ws.close()
    async def _handle_auth_frame(self, data):
        """Process a ``{"type": "auth", "token": "..."}`` frame.

        On success, updates ``self.identity`` and returns an
        ``auth-ok`` response frame. On failure, returns the masked
        auth-failure frame. Never raises — auth failures keep the
        socket open so the client can retry without reconnecting
        (important for browsers, which treat a handshake-time 401
        as terminal)."""
        token = data.get("token", "")
        if not token:
            # Missing token: masked failure, socket stays open.
            await self.ws.send_json({
                "type": "auth-failed",
                "error": "auth failure",
            })
            return

        # Minimal request-like shim — IamAuth.authenticate() only
        # reads the Authorization header off the request object.
        class _Shim:
            def __init__(self, tok):
                self.headers = {"Authorization": f"Bearer {tok}"}

        try:
            identity = await self.auth.authenticate(_Shim(token))
        except Exception:
            # Every failure mode (malformed / invalid / expired /
            # revoked) is masked identically.
            await self.ws.send_json({
                "type": "auth-failed",
                "error": "auth failure",
            })
            return

        # Success: bind the identity and acknowledge with the
        # caller's resolved workspace.
        self.identity = identity
        await self.ws.send_json({
            "type": "auth-ok",
            "workspace": identity.workspace,
        })
async def receive(self, msg):
request_id = None
@ -38,6 +90,16 @@ class Mux:
try:
data = msg.json()
# In-band auth protocol: the client sends
# ``{"type": "auth", "token": "..."}`` as its first frame
# (and any time it wants to re-auth: JWT refresh, token
# rotation, etc). Auth is always required on a Mux —
# there is no no-auth mode.
if isinstance(data, dict) and data.get("type") == "auth":
await self._handle_auth_frame(data)
return
request_id = data.get("id")
if "request" not in data:
@ -46,9 +108,125 @@ class Mux:
if "id" not in data:
raise RuntimeError("Bad message")
# Reject all non-auth frames until an ``auth-ok`` has
# been issued.
if self.identity is None:
await self.ws.send_json({
"id": request_id,
"error": {
"message": "auth failure",
"type": "auth-required",
},
"complete": True,
})
return
# Per-service capability gating. Resolved through the
# operation registry so the WS path matches what HTTP
# callers see — same authority, same caps.
#
# Lookup mirrors the HTTP routing decision in
# ``request_task``: presence of ``flow`` on the envelope
# means a flow-level data-plane service (graph-rag,
# agent, …); absence means a workspace-level service
# (config, flow management, librarian, …) whose specific
# operation is in the inner request body. ``iam`` is
# treated as workspace-level too — its operations are
# registered with bare names, no kind prefix.
from ..registry import lookup as _registry_lookup
from ..capabilities import enforce_workspace
from aiohttp import web as _web
service = data.get("service", "")
inner = data.get("request") or {}
inner_op = inner.get("operation", "") if isinstance(inner, dict) else ""
if data.get("flow"):
op = _registry_lookup(f"flow-service:{service}")
elif service == "iam":
op = _registry_lookup(inner_op) if inner_op else None
else:
op = _registry_lookup(f"{service}:{inner_op}") if inner_op else None
if op is None:
await self.ws.send_json({
"id": request_id,
"error": {
"message": "unknown service",
"type": "unknown-service",
},
"complete": True,
})
return
# Resolve workspace first (default-fill from the caller's
# bound workspace), then ask the regime to authorise the
# service-level capability against the matched
# operation's resource shape.
try:
await enforce_workspace(data, self.identity, self.auth)
if isinstance(inner, dict):
await enforce_workspace(inner, self.identity, self.auth)
if data.get("flow"):
resource = {
"workspace": data.get("workspace", ""),
"flow": data.get("flow", ""),
}
parameters = {}
else:
# Build a minimal RequestContext so the matched
# operation's own extractors decide resource and
# parameters — same path the HTTP endpoints take.
from ..registry import RequestContext
ctx = RequestContext(
body=inner if isinstance(inner, dict) else {},
match_info={},
identity=self.identity,
)
resource = op.extract_resource(ctx)
parameters = op.extract_parameters(ctx)
await self.auth.authorise(
self.identity, op.capability, resource, parameters,
)
except _web.HTTPForbidden:
await self.ws.send_json({
"id": request_id,
"error": {
"message": "access denied",
"type": "access-denied",
},
"complete": True,
})
return
except _web.HTTPUnauthorized:
await self.ws.send_json({
"id": request_id,
"error": {
"message": "auth failure",
"type": "auth-required",
},
"complete": True,
})
return
workspace = data["workspace"]
# Plumb authenticated caller's handle as ``actor`` so
# iam-svc handlers (whoami, future actor-scoped checks)
# know who is calling. Overwrite any caller-supplied
# value so it can't be spoofed over the WS.
if (
service == "iam"
and isinstance(data.get("request"), dict)
and self.identity is not None
):
data["request"]["actor"] = self.identity.handle
await self.q.put((
data["id"],
data.get("workspace", "default"),
workspace,
data.get("flow"),
data["service"],
data["request"]

View file

@ -0,0 +1,131 @@
"""
Gateway auth endpoints.
Three dedicated paths:
POST /api/v1/auth/login unauthenticated; username/password JWT
POST /api/v1/auth/bootstrap unauthenticated; IAM bootstrap op
POST /api/v1/auth/change-password authenticated; any role
These are the only IAM-surface operations that can be reached from
outside. Everything else routes through ``/api/v1/iam`` gated by
``users:admin``.
"""
import logging
from aiohttp import web
from .. capabilities import enforce, PUBLIC, AUTHENTICATED
logger = logging.getLogger("auth-endpoints")
logger.setLevel(logging.INFO)
class AuthEndpoints:
    """Groups the three auth-surface handlers. Each forwards to the
    IAM service via the existing ``IamRequestor`` dispatcher."""

    def __init__(self, iam_dispatcher, auth):
        # Dispatcher wrapper that forwards request bodies to iam-svc.
        self.iam = iam_dispatcher
        # Gateway IamAuth — used by the authenticated endpoint(s).
        self.auth = auth

    async def start(self):
        # No startup work; present for endpoint-manager symmetry.
        pass

    def add_routes(self, app):
        # Fixed paths; registered ahead of the generic /api/v1/{kind}
        # routes so they win the match.
        app.add_routes([
            web.post("/api/v1/auth/login", self.login),
            web.post("/api/v1/auth/bootstrap", self.bootstrap),
            web.post(
                "/api/v1/auth/bootstrap-status",
                self.bootstrap_status,
            ),
            web.post(
                "/api/v1/auth/change-password",
                self.change_password,
            ),
        ])

    async def _forward(self, body):
        """Forward ``body`` to iam-svc. These calls are single-shot,
        so the streaming responder is a no-op."""
        async def responder(x, fin):
            pass
        return await self.iam.process(body, responder)

    async def login(self, request):
        """Public. Accepts {username, password, workspace?}. Returns
        {jwt, jwt_expires} on success; IAM's masked auth failure on
        anything else."""
        await enforce(request, self.auth, PUBLIC)
        try:
            body = await request.json()
        except Exception:
            return web.json_response(
                {"error": "invalid json"}, status=400,
            )
        req = {
            "operation": "login",
            "username": body.get("username", ""),
            "password": body.get("password", ""),
            "workspace": body.get("workspace", ""),
        }
        resp = await self._forward(req)
        if "error" in resp:
            # Mask every failure mode identically — no username /
            # password oracle.
            return web.json_response(
                {"error": "auth failure"}, status=401,
            )
        return web.json_response(resp)

    async def bootstrap(self, request):
        """Public. Valid only when IAM is running in bootstrap mode
        with empty tables. In every other case the IAM service
        returns a masked auth-failure."""
        await enforce(request, self.auth, PUBLIC)
        resp = await self._forward({"operation": "bootstrap"})
        if "error" in resp:
            return web.json_response(
                {"error": "auth failure"}, status=401,
            )
        return web.json_response(resp)

    async def bootstrap_status(self, request):
        """Public, side-effect-free. Returns ``{"bootstrap_available":
        bool}`` so a UI can decide whether to render first-run setup
        without invoking the consuming ``bootstrap`` op."""
        await enforce(request, self.auth, PUBLIC)
        resp = await self._forward({"operation": "bootstrap-status"})
        if "error" in resp:
            return web.json_response(
                {"error": "auth failure"}, status=401,
            )
        return web.json_response(resp)

    async def change_password(self, request):
        """Authenticated (any role). Accepts {current_password,
        new_password}; user_id is taken from the authenticated
        identity — the caller cannot change someone else's password
        this way (reset-password is the admin path)."""
        identity = await enforce(request, self.auth, AUTHENTICATED)
        try:
            body = await request.json()
        except Exception:
            return web.json_response(
                {"error": "invalid json"}, status=400,
            )
        req = {
            "operation": "change-password",
            "user_id": identity.handle,
            "password": body.get("current_password", ""),
            "new_password": body.get("new_password", ""),
        }
        resp = await self._forward(req)
        if "error" in resp:
            # Wrong current password → masked 401; anything else is
            # surfaced as a 400 with IAM's message.
            # NOTE(review): assumes resp["error"] is a dict with
            # "type"/"message" keys — confirm against iam-svc's
            # error envelope.
            err_type = resp.get("error", {}).get("type", "")
            if err_type == "auth-failed":
                return web.json_response(
                    {"error": "auth failure"}, status=401,
                )
            return web.json_response(
                {"error": resp.get("error", {}).get("message", "error")},
                status=400,
            )
        return web.json_response(resp)

View file

@ -1,28 +1,27 @@
import asyncio
from aiohttp import web
import uuid
import logging
from aiohttp import web
from .. capabilities import enforce, enforce_workspace
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class ConstantEndpoint:
def __init__(self, endpoint_path, auth, dispatcher):
def __init__(self, endpoint_path, auth, dispatcher, capability):
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
app.add_routes([
web.post(self.path, self.handle),
])
@ -31,22 +30,14 @@ class ConstantEndpoint:
logger.debug(f"Processing request: {request.path}")
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except:
token = ""
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
identity = await enforce(request, self.auth, self.capability)
try:
data = await request.json()
if identity is not None:
await enforce_workspace(data, identity, self.auth)
async def responder(x, fin):
pass
@ -54,10 +45,8 @@ class ConstantEndpoint:
return web.json_response(resp)
except web.HTTPException:
raise
except Exception as e:
logging.error(f"Exception: {e}")
return web.json_response(
{ "error": str(e) }
)
logger.error(f"Exception: {e}", exc_info=True)
return web.json_response({"error": str(e)})

View file

@ -4,16 +4,18 @@ from aiohttp import web
from trustgraph.i18n import get_language_pack
from .. capabilities import enforce
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class I18nPackEndpoint:
def __init__(self, endpoint_path: str, auth):
def __init__(self, endpoint_path: str, auth, capability):
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
async def start(self):
pass
@ -26,26 +28,13 @@ class I18nPackEndpoint:
async def handle(self, request):
logger.debug(f"Processing i18n pack request: {request.path}")
token = ""
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except Exception:
token = ""
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
await enforce(request, self.auth, self.capability)
lang = request.match_info.get("lang") or "en"
# This is a path traversal defense, and is a critical sec defense.
# Do not remove!
# Path-traversal defense — critical, do not remove.
if "/" in lang or ".." in lang:
return web.HTTPBadRequest(reason="Invalid language code")
pack = get_language_pack(lang)
return web.json_response(pack)

View file

@ -0,0 +1,114 @@
"""
Registry-driven /api/v1/iam endpoint.
The gateway no longer gates IAM management with a single coarse
``users:admin`` capability. Instead, each operation declares its
own capability + resource shape in the registry (``registry.py``);
this endpoint reads the body's ``operation`` field, looks up the
declaration, and asks the IAM regime to authorise the call.
Operations not in the registry produce a 400 ``unknown operation``.
This is the gateway's primary mechanism for fail-closed gating of
the IAM surface — the registry is the source of truth.
"""
import logging
from aiohttp import web
from .. capabilities import (
PUBLIC, AUTHENTICATED, auth_failure,
)
from .. registry import lookup, RequestContext
logger = logging.getLogger("iam-endpoint")
logger.setLevel(logging.INFO)
class IamEndpoint:
    """POST /api/v1/iam — generic forwarder gated by the operation
    registry. The IAM dispatcher (``dispatcher``) forwards the
    body verbatim to iam-svc once authorisation succeeds."""

    def __init__(self, endpoint_path, auth, dispatcher):
        # Route path, e.g. "/api/v1/iam".
        self.path = endpoint_path
        # Gateway IamAuth — authentication and authorisation.
        self.auth = auth
        # Forwards bodies to iam-svc.
        self.dispatcher = dispatcher

    async def start(self):
        # No startup work; present for endpoint-manager symmetry.
        pass

    def add_routes(self, app):
        app.add_routes([web.post(self.path, self.handle)])

    async def handle(self, request):
        """Authenticate / authorise per the operation registry, then
        forward the body to iam-svc.

        Returns 400 for malformed JSON, non-object bodies, unknown
        operations, or extractor failures. IamAuth's masked 401/403
        exceptions propagate to aiohttp unchanged."""
        try:
            body = await request.json()
        except Exception:
            return web.json_response(
                {"error": "invalid json"}, status=400,
            )
        if not isinstance(body, dict):
            return web.json_response(
                {"error": "body must be an object"}, status=400,
            )

        op_name = body.get("operation", "")
        op = lookup(op_name)
        if op is None:
            # Fail closed: only registry-declared operations pass.
            return web.json_response(
                {"error": "unknown operation"}, status=400,
            )

        # Authentication: required for everything except PUBLIC.
        # (HTTPUnauthorized from authenticate() propagates as-is —
        # no try/except needed around it.)
        identity = None
        if op.capability != PUBLIC:
            identity = await self.auth.authenticate(request)

        # Authorisation: capability sentinels short-circuit the
        # regime call; capability strings go through authorise().
        if op.capability not in (PUBLIC, AUTHENTICATED):
            ctx = RequestContext(
                body=body,
                match_info=dict(request.match_info),
                identity=identity,
            )
            try:
                resource = op.extract_resource(ctx)
                parameters = op.extract_parameters(ctx)
            except Exception as e:
                logger.warning(
                    f"extractor failed for {op_name!r}: "
                    f"{type(e).__name__}: {e}"
                )
                return web.json_response(
                    {"error": "bad request"}, status=400,
                )
            await self.auth.authorise(
                identity, op.capability, resource, parameters,
            )

        # Plumb the authenticated caller's handle through as ``actor``
        # so iam-svc handlers (e.g. whoami, future actor-scoped
        # checks) know who is making the request. The gateway is
        # the only authority for this — body-supplied ``actor``
        # values are overwritten so callers can't impersonate.
        if identity is not None:
            body["actor"] = identity.handle

        async def responder(x, fin):
            # Single-shot call; the streaming responder is a no-op.
            pass

        try:
            resp = await self.dispatcher.process(body, responder)
        except web.HTTPException:
            raise
        except Exception as e:
            logger.error(f"Exception: {e}", exc_info=True)
            return web.json_response({"error": str(e)})
        return web.json_response(resp)

View file

@ -8,72 +8,269 @@ from . variable_endpoint import VariableEndpoint
from . socket import SocketEndpoint
from . metrics import MetricsEndpoint
from . i18n import I18nPackEndpoint
from . auth_endpoints import AuthEndpoints
from . iam_endpoint import IamEndpoint
from . registry_endpoint import RegistryRoutedVariableEndpoint
from .. capabilities import PUBLIC, AUTHENTICATED, auth_failure
from .. registry import lookup as _registry_lookup, RequestContext
from .. dispatch.manager import DispatcherManager
# /api/v1/{kind} (config / flow / librarian / knowledge /
# collection-management), /api/v1/iam, and /api/v1/flow/{flow}/...
# routes are all gated per-operation by the registry, not by a
# per-kind capability map. Login / bootstrap / change-password are
# served by AuthEndpoints with their own PUBLIC / AUTHENTICATED
# sentinels.
import logging as _mgr_logging
_mgr_logger = _mgr_logging.getLogger("endpoint")
class _RoutedVariableEndpoint:
    """HTTP endpoint that gates per request via the operation
    registry.

    The URL's ``kind`` parameter combined with a fixed
    ``registry_prefix`` yields the registry key — e.g. prefix
    ``flow-service`` and kind ``agent`` looks up
    ``flow-service:agent``.

    Used for ``/api/v1/flow/{flow}/service/{kind}`` (per-flow
    data-plane services). ``/api/v1/{kind}`` (workspace-level
    global services) goes through ``RegistryRoutedVariableEndpoint``
    which discriminates on body operation as well as URL kind."""

    def __init__(self, endpoint_path, auth, dispatcher, registry_prefix):
        self.path = endpoint_path
        self.auth = auth
        self.dispatcher = dispatcher
        self._registry_prefix = registry_prefix

    async def start(self):
        # Nothing to initialise.
        pass

    def add_routes(self, app):
        app.add_routes([web.post(self.path, self.handle)])

    async def handle(self, request):
        # Unknown kinds are a routing miss (404), not an auth failure.
        url_kind = request.match_info.get("kind", "")
        registered = _registry_lookup(f"{self._registry_prefix}:{url_kind}")
        if registered is None:
            return web.json_response(
                {"error": "unknown kind"}, status=404,
            )

        identity = await self.auth.authenticate(request)

        try:
            payload = await request.json()

            # The matched operation's own extractors decide resource
            # and parameters; then the regime is asked for a decision.
            ctx = RequestContext(
                body=payload if isinstance(payload, dict) else {},
                match_info=dict(request.match_info),
                identity=identity,
            )
            await self.auth.authorise(
                identity,
                registered.capability,
                registered.extract_resource(ctx),
                registered.extract_parameters(ctx),
            )

            async def responder(x, fin):
                # Single-shot call; streaming responder is a no-op.
                pass

            result = await self.dispatcher.process(
                payload, responder, request.match_info,
            )
            return web.json_response(result)
        except web.HTTPException:
            # Masked 401/403 responses propagate unchanged.
            raise
        except Exception as e:
            _mgr_logger.error(f"Exception: {e}", exc_info=True)
            return web.json_response({"error": str(e)})
class _RoutedSocketEndpoint:
    """WebSocket endpoint gated per request via the operation
    registry. Like ``_RoutedVariableEndpoint`` but for the
    streaming flow import / export socket paths."""

    def __init__(self, endpoint_path, auth, dispatcher, registry_prefix):
        self.path = endpoint_path
        self.auth = auth
        self.dispatcher = dispatcher
        # Combined with the URL's {kind} to form the registry key.
        self._registry_prefix = registry_prefix

    async def start(self):
        # No startup work; present for endpoint-manager symmetry.
        pass

    def add_routes(self, app):
        app.add_routes([web.get(self.path, self.handle)])

    async def handle(self, request):
        # Unknown kinds are a routing miss (404), not an auth failure.
        kind = request.match_info.get("kind", "")
        op = _registry_lookup(f"{self._registry_prefix}:{kind}")
        if op is None:
            return web.json_response(
                {"error": "unknown kind"}, status=404,
            )
        # WebSocket clients can't set headers on the handshake, so
        # the credential arrives as a ?token= query parameter.
        token = request.query.get("token", "")
        if not token:
            return auth_failure()
        from . socket import _QueryTokenRequest
        try:
            identity = await self.auth.authenticate(
                _QueryTokenRequest(token)
            )
        except web.HTTPException as e:
            # Returned (not raised) so the masked error body is sent
            # as the handshake response.
            return e
        ctx = RequestContext(
            body={},
            match_info=dict(request.match_info),
            identity=identity,
        )
        try:
            resource = op.extract_resource(ctx)
            parameters = op.extract_parameters(ctx)
            await self.auth.authorise(
                identity, op.capability, resource, parameters,
            )
        except web.HTTPException as e:
            return e
        # Delegate the websocket handling to a standalone SocketEndpoint
        # with the resolved capability, bypassing the per-request mutation
        # concern by instantiating fresh state.
        ws_ep = SocketEndpoint(
            endpoint_path=self.path,
            auth=self.auth,
            dispatcher=self.dispatcher,
            capability=op.capability,
        )
        return await ws_ep.handle(request)
class EndpointManager:
def __init__(
self, dispatcher_manager, auth, prometheus_url, timeout=600
self, dispatcher_manager, auth, prometheus_url, timeout=600,
):
self.dispatcher_manager = dispatcher_manager
self.timeout = timeout
self.services = {
}
self.endpoints = [
# Auth surface — public / authenticated-any. Must come
# before the generic /api/v1/{kind} routes to win the
# match for /api/v1/auth/* paths. aiohttp routes in
# registration order, so we prepend here.
AuthEndpoints(
iam_dispatcher=dispatcher_manager.dispatch_auth_iam(),
auth=auth,
),
# /api/v1/iam — registry-driven IAM management. Per
# operation gating happens inside IamEndpoint via the
# operation registry; the dispatcher forwards verbatim
# to iam-svc once authorisation has succeeded. Listed
# before the generic /api/v1/{kind} route so it wins
# the match for "iam".
IamEndpoint(
endpoint_path="/api/v1/iam",
auth=auth,
dispatcher=dispatcher_manager.dispatch_auth_iam(),
),
I18nPackEndpoint(
endpoint_path = "/api/v1/i18n/packs/{lang}",
auth = auth,
endpoint_path="/api/v1/i18n/packs/{lang}",
auth=auth,
capability=PUBLIC,
),
MetricsEndpoint(
endpoint_path = "/api/metrics",
prometheus_url = prometheus_url,
auth = auth,
endpoint_path="/api/metrics",
prometheus_url=prometheus_url,
auth=auth,
capability="metrics:read",
),
VariableEndpoint(
endpoint_path = "/api/v1/{kind}", auth = auth,
dispatcher = dispatcher_manager.dispatch_global_service(),
# Global services: registry-driven per-operation gating.
# Each kind+op combination has a registry entry that
# declares its capability and resource shape. Listed
# after the IAM and auth-surface routes; aiohttp's
# path matcher prefers the more-specific path so this
# variable route doesn't shadow them.
RegistryRoutedVariableEndpoint(
endpoint_path="/api/v1/{kind}",
auth=auth,
dispatcher=dispatcher_manager.dispatch_global_service(),
),
# /api/v1/socket: WebSocket handshake accepts
# unconditionally; the Mux dispatcher runs the
# first-frame auth protocol. Handshake-time 401s break
# browser reconnection, so authentication is always
# in-band for this endpoint.
SocketEndpoint(
endpoint_path = "/api/v1/socket",
auth = auth,
dispatcher = dispatcher_manager.dispatch_socket()
endpoint_path="/api/v1/socket",
auth=auth,
dispatcher=dispatcher_manager.dispatch_socket(),
capability=AUTHENTICATED, # informational only; bypassed
in_band_auth=True,
),
VariableEndpoint(
endpoint_path = "/api/v1/flow/{flow}/service/{kind}",
auth = auth,
dispatcher = dispatcher_manager.dispatch_flow_service(),
# Per-flow request/response services — gated per
# ``flow-service:<kind>`` registry entry.
_RoutedVariableEndpoint(
endpoint_path="/api/v1/flow/{flow}/service/{kind}",
auth=auth,
dispatcher=dispatcher_manager.dispatch_flow_service(),
registry_prefix="flow-service",
),
SocketEndpoint(
endpoint_path = "/api/v1/flow/{flow}/import/{kind}",
auth = auth,
dispatcher = dispatcher_manager.dispatch_flow_import()
# Per-flow streaming import/export — gated per
# ``flow-import:<kind>`` / ``flow-export:<kind>`` registry
# entry.
_RoutedSocketEndpoint(
endpoint_path="/api/v1/flow/{flow}/import/{kind}",
auth=auth,
dispatcher=dispatcher_manager.dispatch_flow_import(),
registry_prefix="flow-import",
),
SocketEndpoint(
endpoint_path = "/api/v1/flow/{flow}/export/{kind}",
auth = auth,
dispatcher = dispatcher_manager.dispatch_flow_export()
_RoutedSocketEndpoint(
endpoint_path="/api/v1/flow/{flow}/export/{kind}",
auth=auth,
dispatcher=dispatcher_manager.dispatch_flow_export(),
registry_prefix="flow-export",
),
StreamEndpoint(
endpoint_path="/api/v1/import-core",
auth=auth,
method="POST",
dispatcher=dispatcher_manager.dispatch_core_import(),
# Cross-subject import — require the admin bundle via a
# single representative capability.
capability="users:admin",
),
StreamEndpoint(
endpoint_path = "/api/v1/import-core",
auth = auth,
method = "POST",
dispatcher = dispatcher_manager.dispatch_core_import(),
endpoint_path="/api/v1/export-core",
auth=auth,
method="GET",
dispatcher=dispatcher_manager.dispatch_core_export(),
capability="users:admin",
),
StreamEndpoint(
endpoint_path = "/api/v1/export-core",
auth = auth,
method = "GET",
dispatcher = dispatcher_manager.dispatch_core_export(),
),
StreamEndpoint(
endpoint_path = "/api/v1/document-stream",
auth = auth,
method = "GET",
dispatcher = dispatcher_manager.dispatch_document_stream(),
endpoint_path="/api/v1/document-stream",
auth=auth,
method="GET",
dispatcher=dispatcher_manager.dispatch_document_stream(),
capability="documents:read",
),
]
@ -84,4 +281,3 @@ class EndpointManager:
    async def start(self):
        """Start every registered endpoint, in registration order."""
        for ep in self.endpoints:
            await ep.start()

View file

@ -10,17 +10,19 @@ import asyncio
import uuid
import logging
from .. capabilities import enforce
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class MetricsEndpoint:
def __init__(self, prometheus_url, endpoint_path, auth):
def __init__(self, prometheus_url, endpoint_path, auth, capability):
self.prometheus_url = prometheus_url
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
async def start(self):
pass
@ -35,38 +37,39 @@ class MetricsEndpoint:
logger.debug(f"Processing metrics request: {request.path}")
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except:
token = ""
await enforce(request, self.auth, self.capability)
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
path = request.match_info["path"]
url = (
self.prometheus_url + "/api/v1/" + path + "?" +
request.query_string
)
try:
path = request.match_info["path"]
async with aiohttp.ClientSession() as session:
url = (
self.prometheus_url + "/api/v1/" + path + "?" +
request.query_string
)
async with session.get(url) as resp:
return web.Response(
status=resp.status,
text=await resp.text()
)
except aiohttp.ClientConnectionError as e:
# Upstream unreachable (connect refused, DNS failure,
# server disconnect). Distinguish from our own errors so
# callers know where the fault is.
logger.error(f"Metrics upstream {url} unreachable: {e}")
return web.Response(
status=502,
text=f"Bad Gateway: metrics upstream unreachable: {e}",
)
except Exception as e:
logging.error(f"Exception: {e}")
raise web.HTTPInternalServerError()
logger.error(f"Metrics proxy exception: {e}", exc_info=True)
return web.Response(
status=500,
text=f"Internal Server Error: {e}",
)

View file

@ -0,0 +1,123 @@
"""
Registry-driven dispatch for ``/api/v1/{kind}`` global services.
The body's ``operation`` field plus the URL's ``{kind}`` together
form the canonical operation name (``<kind>:<operation>``) that the
gateway looks up in ``registry.py``. The matched operation
declares its capability and resource shape; this endpoint asks the
IAM regime to authorise the call before forwarding the body
verbatim to the backend dispatcher.
The dispatcher is the same ``dispatch_global_service()`` factory the
old coarse path used; only the gating layer has changed.
Operations not present in the registry are rejected with 400
``unknown operation`` — the gateway fails closed.
"""
import logging
from aiohttp import web
from .. capabilities import (
PUBLIC, AUTHENTICATED, auth_failure,
)
from .. registry import lookup, RequestContext
logger = logging.getLogger("registry-endpoint")
logger.setLevel(logging.INFO)
class RegistryRoutedVariableEndpoint:
    """POST /api/v1/{kind} — kind comes from the URL, operation comes
    from the body, both are joined as the registry key.

    Fails closed: a kind/operation pair absent from the registry is
    rejected before authentication is attempted.
    """

    def __init__(self, endpoint_path, auth, dispatcher):
        # endpoint_path: aiohttp route pattern containing {kind}.
        self.path = endpoint_path
        self.auth = auth
        self.dispatcher = dispatcher

    async def start(self):
        # Nothing to start; present for endpoint interface uniformity.
        pass

    def add_routes(self, app):
        app.add_routes([web.post(self.path, self.handle)])

    async def handle(self, request):
        """Gate, authorise and forward one request.

        Order matters: kind check → body parse → operation lookup →
        authenticate (unless PUBLIC) → authorise (unless PUBLIC /
        AUTHENTICATED) → forward verbatim to the dispatcher.
        """
        kind = request.match_info.get("kind", "")
        if not kind:
            return web.json_response(
                {"error": "missing kind"}, status=404,
            )
        try:
            body = await request.json()
        except Exception:
            return web.json_response(
                {"error": "invalid json"}, status=400,
            )
        if not isinstance(body, dict):
            return web.json_response(
                {"error": "body must be an object"}, status=400,
            )
        op_name = body.get("operation", "")
        if not op_name:
            return web.json_response(
                {"error": "missing operation"}, status=400,
            )
        # Canonical registry key: <kind>:<operation>.
        registry_key = f"{kind}:{op_name}"
        op = lookup(registry_key)
        if op is None:
            # Fail closed on anything not declared in the registry.
            return web.json_response(
                {"error": "unknown operation"}, status=400,
            )
        identity = None
        if op.capability != PUBLIC:
            # authenticate() raises an HTTPException (propagated to
            # aiohttp) on failure.
            identity = await self.auth.authenticate(request)
        if op.capability not in (PUBLIC, AUTHENTICATED):
            ctx = RequestContext(
                body=body,
                match_info=dict(request.match_info),
                identity=identity,
            )
            try:
                resource = op.extract_resource(ctx)
                parameters = op.extract_parameters(ctx)
            except Exception as e:
                # Extractor failure means the request shape is wrong,
                # not that we are — report 400, not 500.
                logger.warning(
                    f"extractor failed for {registry_key!r}: "
                    f"{type(e).__name__}: {e}"
                )
                return web.json_response(
                    {"error": "bad request"}, status=400,
                )
            await self.auth.authorise(
                identity, op.capability, resource, parameters,
            )
            # Default-fill workspace into the body so downstream
            # dispatchers see the canonical resolved value. The
            # extractor has already pulled the workspace out;
            # mirror it back to the body for the verbatim forward.
            if "workspace" in resource:
                body["workspace"] = resource["workspace"]

        async def responder(x, fin):
            # Request/response path: streaming callback is a no-op.
            pass

        try:
            resp = await self.dispatcher.process(
                body, responder, request.match_info,
            )
        except web.HTTPException:
            # Let aiohttp render HTTP errors natively.
            raise
        except Exception as e:
            logger.error(f"Exception: {e}", exc_info=True)
            return web.json_response({"error": str(e)})
        return web.json_response(resp)

View file

@ -4,6 +4,9 @@ from aiohttp import web, WSMsgType
import logging
from .. running import Running
from .. capabilities import (
PUBLIC, AUTHENTICATED, auth_failure,
)
logger = logging.getLogger("socket")
logger.setLevel(logging.INFO)
@ -11,12 +14,25 @@ logger.setLevel(logging.INFO)
class SocketEndpoint:
def __init__(
self, endpoint_path, auth, dispatcher,
self, endpoint_path, auth, dispatcher, capability,
in_band_auth=False,
):
"""
``in_band_auth=True`` skips the handshake-time auth check.
The WebSocket handshake always succeeds; the dispatcher is
expected to gate itself via the first-frame auth protocol
(see ``Mux``).
This avoids the browser problem where a 401 on the handshake
is treated as permanent and prevents reconnection, and lets
long-lived sockets refresh their credential mid-session by
sending a new auth frame.
"""
self.path = endpoint_path
self.auth = auth
self.operation = "socket"
self.capability = capability
self.in_band_auth = in_band_auth
self.dispatcher = dispatcher
@ -61,15 +77,33 @@ class SocketEndpoint:
raise
async def handle(self, request):
"""Enhanced handler with better cleanup"""
try:
token = request.query['token']
except:
token = ""
"""Enhanced handler with better cleanup.
Auth: WebSocket clients pass the bearer token on the
``?token=...`` query string; we wrap it into a synthetic
Authorization header before delegating to the standard auth
path so the IAM-backed flow (JWT / API key) applies uniformly.
The first-frame auth protocol described in the IAM spec is
a future upgrade."""
if not self.in_band_auth and self.capability != PUBLIC:
token = request.query.get("token", "")
if not token:
return auth_failure()
try:
identity = await self.auth.authenticate(
_QueryTokenRequest(token)
)
except web.HTTPException as e:
return e
if self.capability != AUTHENTICATED:
try:
await self.auth.authorise(
identity, self.capability, {}, {},
)
except web.HTTPException as e:
return e
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
# 50MB max message size
ws = web.WebSocketResponse(max_msg_size=52428800)
@ -150,3 +184,11 @@ class SocketEndpoint:
web.get(self.path, self.handle),
])
class _QueryTokenRequest:
"""Minimal shim that exposes headers["Authorization"] to
IamAuth.authenticate(), derived from a query-string token."""
def __init__(self, token):
self.headers = {"Authorization": f"Bearer {token}"}

View file

@ -1,82 +1,64 @@
import asyncio
from aiohttp import web
import logging
from aiohttp import web
from .. capabilities import enforce
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class StreamEndpoint:
def __init__(self, endpoint_path, auth, dispatcher, method="POST"):
def __init__(
self, endpoint_path, auth, dispatcher, capability, method="POST",
):
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
self.method = method
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
if self.method == "POST":
app.add_routes([
web.post(self.path, self.handle),
])
app.add_routes([web.post(self.path, self.handle)])
elif self.method == "GET":
app.add_routes([
web.get(self.path, self.handle),
])
app.add_routes([web.get(self.path, self.handle)])
else:
raise RuntimeError("Bad method" + self.method)
raise RuntimeError("Bad method " + self.method)
async def handle(self, request):
logger.debug(f"Processing request: {request.path}")
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except:
token = ""
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
await enforce(request, self.auth, self.capability)
try:
data = request.content
async def error(err):
return web.HTTPInternalServerError(text = err)
return web.HTTPInternalServerError(text=err)
async def ok(
status=200, reason="OK", type="application/octet-stream"
status=200, reason="OK",
type="application/octet-stream",
):
response = web.StreamResponse(
status = status, reason = reason,
headers = {"Content-Type": type}
status=status, reason=reason,
headers={"Content-Type": type},
)
await response.prepare(request)
return response
resp = await self.dispatcher.process(
data, error, ok, request
)
resp = await self.dispatcher.process(data, error, ok, request)
return resp
except web.HTTPException:
raise
except Exception as e:
logging.error(f"Exception: {e}")
return web.json_response(
{ "error": str(e) }
)
logger.error(f"Exception: {e}", exc_info=True)
return web.json_response({"error": str(e)})

View file

@ -1,27 +1,27 @@
import asyncio
from aiohttp import web
import logging
from aiohttp import web
from .. capabilities import enforce, enforce_workspace
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class VariableEndpoint:
def __init__(self, endpoint_path, auth, dispatcher):
def __init__(self, endpoint_path, auth, dispatcher, capability):
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
app.add_routes([
web.post(self.path, self.handle),
])
@ -30,35 +30,25 @@ class VariableEndpoint:
logger.debug(f"Processing request: {request.path}")
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except:
token = ""
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
identity = await enforce(request, self.auth, self.capability)
try:
data = await request.json()
if identity is not None:
await enforce_workspace(data, identity, self.auth)
async def responder(x, fin):
pass
resp = await self.dispatcher.process(
data, responder, request.match_info
data, responder, request.match_info,
)
return web.json_response(resp)
except web.HTTPException:
raise
except Exception as e:
logging.error(f"Exception: {e}")
return web.json_response(
{ "error": str(e) }
)
logger.error(f"Exception: {e}", exc_info=True)
return web.json_response({"error": str(e)})

View file

@ -0,0 +1,533 @@
"""
Gateway operation registry.
Single declarative table mapping each operation the gateway
recognises to:
- The capability the IAM regime is asked to authorise against.
- The resource level (system / workspace / flow) determines the
shape of the resource identifier handed to ``authorise``.
- Extractors that build the resource and parameters from the
request context.
This is a gateway-internal concept. It is not part of the IAM
contract — the contract specifies what arguments ``authorise``
receives; the registry is how the gateway populates them.
See docs/tech-specs/iam-contract.md for the contract and
docs/tech-specs/iam.md for the request anatomy.
"""
from dataclasses import dataclass, field
from typing import Any, Callable
# Sentinels for operations that don't go through capability-based
# authorisation. Mirror the values used in capabilities.py so the
# gateway endpoint layer can recognise them uniformly.
PUBLIC = "__public__"
AUTHENTICATED = "__authenticated__"
class ResourceLevel:
    """Where the operation's resource lives.

    ``SYSTEM`` — operation acts on a deployment-level resource
        (the user registry, the workspace registry, the signing
        key). resource = {}. Workspace, if relevant, is a
        parameter, not an address.

    ``WORKSPACE`` — operation acts on something within a workspace
        (config, library, knowledge, collections, flow lifecycle).
        resource = {workspace}.

    ``FLOW`` — operation acts on something within a flow within a
        workspace (graph, agent, llm, etc.).
        resource = {workspace, flow}.
    """

    SYSTEM = "system"
    WORKSPACE = "workspace"
    FLOW = "flow"
@dataclass
class RequestContext:
    """The bundle of inputs the registry's extractors operate on.

    Assembled by the gateway from the incoming request after
    authentication; one instance per request."""

    # Parsed JSON body (HTTP) or inner request payload (WebSocket).
    # Extractors defensively re-check isinstance(body, dict).
    body: dict = field(default_factory=dict)

    # URL path components (HTTP) or WebSocket envelope routing
    # fields (id, service, workspace, flow).
    match_info: dict = field(default_factory=dict)

    # Authenticated identity for default-fill-in. Always present
    # by the time extractors run, except for PUBLIC operations
    # where it is None.
    identity: Any = None
@dataclass
class Operation:
    """Declared operation the gateway can dispatch + authorise."""

    # Canonical operation name (used for registry lookup, audit,
    # debug logs). Mirrors the operation strings in the IAM
    # service and other backends where applicable.
    name: str

    # Capability required to invoke this operation. Either a
    # string from the capability vocabulary in capabilities.md, or
    # the PUBLIC / AUTHENTICATED sentinel for operations that
    # don't go through capability-based authorisation.
    capability: str

    # Where the operation's resource lives (a ResourceLevel value).
    # Determines the shape of the resource argument passed to
    # authorise.
    resource_level: str

    # Build the resource identifier from the request context.
    # Returns a dict with the appropriate components for the
    # resource level: {} for SYSTEM, {workspace} for WORKSPACE,
    # {workspace, flow} for FLOW. Default-fill-in of workspace
    # from identity.workspace happens here when applicable.
    extract_resource: Callable[[RequestContext], dict]

    # Build the parameters dict — decision-relevant fields the
    # operation supplied that are not part of the resource
    # address. E.g. workspace association on a system-level
    # user-registry operation.
    extract_parameters: Callable[[RequestContext], dict]
# ---------------------------------------------------------------------------
# Registry storage.
# ---------------------------------------------------------------------------

_REGISTRY: dict[str, Operation] = {}


def register(op: Operation) -> None:
    """Add *op* under its canonical name; duplicates are a coding error."""
    if op.name not in _REGISTRY:
        _REGISTRY[op.name] = op
        return
    raise RuntimeError(
        f"operation {op.name!r} already registered"
    )


def lookup(name: str) -> Operation | None:
    """Return the operation registered under *name*, or None if absent."""
    try:
        return _REGISTRY[name]
    except KeyError:
        return None


def all_operations() -> list[Operation]:
    """Snapshot of every registered operation, in registration order."""
    return [*_REGISTRY.values()]
# ---------------------------------------------------------------------------
# Common extractor helpers.
# ---------------------------------------------------------------------------
def _empty_resource(_ctx: RequestContext) -> dict:
    """System-level operations have no resource address: always {}."""
    return dict()
def _workspace_from_body(ctx: RequestContext) -> dict:
    """Workspace-level resource sourced from the request body's
    ``workspace`` field, defaulting to the caller's bound workspace.

    Returns ``{"workspace": <value>}``. The value is normalised to
    ``""`` when neither the body nor the identity supplies one
    (``dict.get`` would otherwise yield ``None``), keeping the
    resource shape consistent with ``_workspace_param_only``.
    """
    ws = (ctx.body.get("workspace") if isinstance(ctx.body, dict) else "")
    if not ws and ctx.identity is not None:
        # Default-fill from the authenticated identity's bound workspace.
        ws = ctx.identity.workspace
    return {"workspace": ws or ""}
def _flow_from_match_info(ctx: RequestContext) -> dict:
    """Flow-level resource from URL path components or WS envelope
    fields. Workspace and flow are both required parts of the
    address — deliberately no default-fill-in, since the address is
    the operation's identity."""
    routing = ctx.match_info
    return {
        "workspace": routing.get("workspace", ""),
        "flow": routing.get("flow", ""),
    }
def _no_parameters(_ctx: RequestContext) -> dict:
    """Operation carries no decision-relevant parameters."""
    return dict()
def _body_as_parameters(ctx: RequestContext) -> dict:
    """Treat every body field as a parameter — used when the body is
    small and uniformly decision-relevant (e.g. user-registry ops
    where the body's user.workspace is what the regime checks
    against the admin's scope)."""
    if isinstance(ctx.body, dict):
        return dict(ctx.body)
    return {}
def _workspace_param_only(ctx: RequestContext) -> dict:
    """Parameters dict carrying only the workspace association.

    Used by system-level operations (e.g. user-registry ops) where
    the workspace isn't part of the resource address but is the
    field the regime uses to scope the admin's authority.

    Lookup precedence: inner ``user.workspace`` (create-user), then
    inner ``workspace_record.id`` (create-workspace), then the
    top-level body ``workspace``, then the caller's bound workspace;
    finally normalised to "".
    """
    body = ctx.body if isinstance(ctx.body, dict) else {}

    def _inner_dict(key):
        # Sub-objects may be absent or non-dicts; treat both as {}.
        value = body.get(key)
        return value if isinstance(value, dict) else {}

    ws = (
        _inner_dict("user").get("workspace")
        or _inner_dict("workspace_record").get("id")
        or body.get("workspace")
    )
    if not ws and ctx.identity is not None:
        ws = ctx.identity.workspace
    return {"workspace": ws or ""}
# ---------------------------------------------------------------------------
# Operation registrations.
#
# The gateway looks operations up by their canonical name (the same
# string the request body / WS envelope carries in its ``operation``
# field where applicable). Auth-surface operations (login, bootstrap,
# change-password) are not listed here — they have their own routes
# in auth_endpoints.py and use PUBLIC / AUTHENTICATED sentinels
# directly. Pure gateway↔IAM internal operations (resolve-api-key,
# authorise, authorise-many, get-signing-key-public) are likewise
# excluded; they are never invoked over the public API.
# ---------------------------------------------------------------------------
# IAM management operations. All routed through /api/v1/iam, body
# carries ``operation`` plus operation-specific fields.
# User registry: SYSTEM-level resource (users are global, identified
# by handle). The admin's authority is scoped per workspace via the
# parameters {workspace} field — that's what the regime checks
# against the admin's role workspace_scope.
for _name, _cap in (
    ("create-user", "users:admin"),
    ("list-users", "users:read"),
    ("get-user", "users:read"),
    ("update-user", "users:write"),
    ("disable-user", "users:admin"),
    ("enable-user", "users:admin"),
    ("delete-user", "users:admin"),
    ("reset-password", "users:admin"),
):
    register(Operation(
        name=_name,
        capability=_cap,
        resource_level=ResourceLevel.SYSTEM,
        extract_resource=_empty_resource,
        extract_parameters=_workspace_param_only,
    ))
# API keys: SYSTEM-level resource — like users, a key record exists
# in the deployment-wide keys registry. The workspace the key
# authenticates to is a property of the record, not a containment;
# it appears as a parameter so the regime can scope the admin's
# authority to issue / list / revoke against it.
for _name in ("create-api-key", "list-api-keys", "revoke-api-key"):
    register(Operation(
        name=_name,
        capability="keys:admin",
        resource_level=ResourceLevel.SYSTEM,
        extract_resource=_empty_resource,
        extract_parameters=_workspace_param_only,
    ))
# Workspace registry: SYSTEM-level resource (workspaces are the
# top-level addressable unit). No parameters — the workspace being
# acted on is identified by the body, not used as a scope cue.
for _name in (
    "create-workspace",
    "list-workspaces",
    "get-workspace",
    "update-workspace",
    "disable-workspace",
):
    register(Operation(
        name=_name,
        capability="workspaces:admin",
        resource_level=ResourceLevel.SYSTEM,
        extract_resource=_empty_resource,
        extract_parameters=_no_parameters,
    ))

# Signing key: SYSTEM-level operational op.
register(Operation(
    name="rotate-signing-key",
    capability="iam:admin",
    resource_level=ResourceLevel.SYSTEM,
    extract_resource=_empty_resource,
    extract_parameters=_no_parameters,
))
# ---------------------------------------------------------------------------
# Auth-surface entries.
#
# Listed here so the registry is the one place the gateway looks for
# operation→capability mappings — including the sentinels for paths
# that don't go through capability-based authorisation. The actual
# routing is in auth_endpoints.py; these entries let the registry-
# driven dispatcher recognise the operation if it sees it on a
# generic path.
# ---------------------------------------------------------------------------

for _name, _cap in (
    ("login", PUBLIC),
    ("bootstrap", PUBLIC),
    ("bootstrap-status", PUBLIC),
    ("change-password", AUTHENTICATED),
    ("whoami", AUTHENTICATED),
):
    register(Operation(
        name=_name,
        capability=_cap,
        resource_level=ResourceLevel.SYSTEM,
        extract_resource=_empty_resource,
        extract_parameters=_no_parameters,
    ))
# ---------------------------------------------------------------------------
# Generic kind/operation entries.
#
# Names are ``<kind>:<operation>`` so the registry key is unique
# across dispatchers. All entries below are workspace-level
# resources (workspace defaulted from the caller's bound workspace
# if absent). Read/write distinction maps to the existing
# ``<subject>:read`` / ``<subject>:write`` capability vocabulary
# defined in capabilities.md.
# ---------------------------------------------------------------------------
def _register_kind_op(kind: str, op: str, capability: str) -> None:
    """Register a workspace-level ``<kind>:<op>`` using the standard
    extractors (workspace sourced from the body, no extra
    parameters)."""
    spec = dict(
        name=f"{kind}:{op}",
        capability=capability,
        resource_level=ResourceLevel.WORKSPACE,
        extract_resource=_workspace_from_body,
        extract_parameters=_no_parameters,
    )
    register(Operation(**spec))
# Kind → (capability, operations) table. Registration order follows
# the per-kind read-then-write convention used throughout.
_KIND_OPS = (
    # config: KV-style workspace config service.
    ("config", "config:read",
     ("get", "list", "getvalues", "getvalues-all-ws", "config")),
    ("config", "config:write", ("put", "delete")),
    # flow: flow-blueprint and flow-lifecycle service.
    ("flow", "flows:read",
     ("list-blueprints", "get-blueprint", "list-flows", "get-flow")),
    ("flow", "flows:write",
     ("put-blueprint", "delete-blueprint", "start-flow", "stop-flow")),
    # librarian: document storage and processing service.
    ("librarian", "documents:read",
     ("get-document-metadata", "get-document-content",
      "stream-document", "list-documents", "list-processing",
      "get-upload-status", "list-uploads")),
    ("librarian", "documents:write",
     ("add-document", "remove-document", "update-document",
      "add-processing", "remove-processing",
      "begin-upload", "upload-chunk", "complete-upload",
      "abort-upload")),
    # knowledge: knowledge-graph core service.
    ("knowledge", "knowledge:read", ("get-kg-core", "list-kg-cores")),
    ("knowledge", "knowledge:write",
     ("put-kg-core", "delete-kg-core",
      "load-kg-core", "unload-kg-core")),
    # collection-management: workspace collection lifecycle.
    ("collection-management", "collections:read",
     ("list-collections",)),
    ("collection-management", "collections:write",
     ("update-collection", "delete-collection")),
)

for _kind, _cap, _ops in _KIND_OPS:
    for _op in _ops:
        _register_kind_op(_kind, _op, _cap)
# ---------------------------------------------------------------------------
# Per-flow data-plane services.
#
# /api/v1/flow/{flow}/service/{kind} and the streaming
# /api/v1/flow/{flow}/{import,export}/{kind} paths. No body-level
# ``operation`` discriminator — the URL kind is the operation
# identity. Resource is FLOW level (workspace + flow).
#
# Names: ``flow-service:<kind>``, ``flow-import:<kind>``,
# ``flow-export:<kind>``.
# ---------------------------------------------------------------------------
def _register_flow_kind(prefix: str, kind: str, capability: str) -> None:
    """Register a FLOW-level ``<prefix>:<kind>`` data-plane entry
    (resource = workspace + flow from the URL / envelope)."""
    entry = Operation(
        name=f"{prefix}:{kind}",
        capability=capability,
        resource_level=ResourceLevel.FLOW,
        extract_resource=_flow_from_match_info,
        extract_parameters=_no_parameters,
    )
    register(entry)
# Request/response services on /api/v1/flow/{flow}/service/{kind}.
_FLOW_SERVICES = {
    "agent": "agent",
    "text-completion": "llm",
    "prompt": "llm",
    "mcp-tool": "mcp",
    "graph-rag": "graph:read",
    "document-rag": "documents:read",
    "embeddings": "embeddings",
    "graph-embeddings": "graph:read",
    "document-embeddings": "documents:read",
    "triples": "graph:read",
    "rows": "rows:read",
    "nlp-query": "rows:read",
    "structured-query": "rows:read",
    "structured-diag": "rows:read",
    "row-embeddings": "rows:read",
    "sparql": "graph:read",
}

# Streaming import socket endpoints.
_FLOW_IMPORTS = {
    "triples": "graph:write",
    "graph-embeddings": "graph:write",
    "document-embeddings": "documents:write",
    "entity-contexts": "documents:write",
    "rows": "rows:write",
}

# Streaming export socket endpoints.
_FLOW_EXPORTS = {
    "triples": "graph:read",
    "graph-embeddings": "graph:read",
    "document-embeddings": "documents:read",
    "entity-contexts": "documents:read",
}

for _prefix, _table in (
    ("flow-service", _FLOW_SERVICES),
    ("flow-import", _FLOW_IMPORTS),
    ("flow-export", _FLOW_EXPORTS),
):
    for _kind, _cap in _table.items():
        _register_flow_kind(_prefix, _kind, _cap)

View file

@ -12,7 +12,7 @@ import os
from trustgraph.base.logging import setup_logging, add_logging_args
from trustgraph.base.pubsub import get_pubsub, add_pubsub_args
from . auth import Authenticator
from . auth import IamAuth
from . config.receiver import ConfigReceiver
from . dispatch.manager import DispatcherManager
@ -35,7 +35,6 @@ default_prometheus_url = os.getenv("PROMETHEUS_URL", "http://prometheus:9090")
default_pulsar_api_key = os.getenv("PULSAR_API_KEY", None)
default_timeout = 600
default_port = 8088
default_api_token = os.getenv("GATEWAY_SECRET", "")
class Api:
@ -60,13 +59,14 @@ class Api:
if not self.prometheus_url.endswith("/"):
self.prometheus_url += "/"
api_token = config.get("api_token", default_api_token)
# Token not set, or token equal empty string means no auth
if api_token:
self.auth = Authenticator(token=api_token)
else:
self.auth = Authenticator(allow_all=True)
# IAM-backed authentication. The legacy GATEWAY_SECRET
# shared-token path has been removed — there is no
# "open for everyone" fallback. The gateway cannot
# authenticate any request until IAM is reachable.
self.auth = IamAuth(
backend=self.pubsub_backend,
id=config.get("id", "api-gateway"),
)
self.config_receiver = ConfigReceiver(self.pubsub_backend)
@ -118,6 +118,7 @@ class Api:
config_receiver = self.config_receiver,
prefix = "gateway",
queue_overrides = queue_overrides,
auth = self.auth,
)
self.endpoint_manager = EndpointManager(
@ -132,12 +133,18 @@ class Api:
]
async def app_factory(self):
self.app = web.Application(
middlewares=[],
client_max_size=256 * 1024 * 1024
)
# Fetch IAM signing public key before accepting traffic.
# Blocks for a bounded retry window; the gateway starts even
# if IAM is still unreachable (JWT validation will 401 until
# the key is available).
await self.auth.start()
await self.config_receiver.start()
for ep in self.endpoints:
@ -189,12 +196,6 @@ def run():
help=f'API request timeout in seconds (default: {default_timeout})',
)
parser.add_argument(
'--api-token',
default=default_api_token,
help=f'Secret API token (default: no auth)',
)
add_logging_args(parser)
parser.add_argument(

View file

@ -0,0 +1 @@
from . service import *

View file

@ -0,0 +1,4 @@
from . service import run
run()

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,233 @@
"""
IAM service processor. Terminates the IAM request queue and forwards
each request to the IamService business logic, then returns the
response on the IAM response queue.
Shape mirrors trustgraph.config.service.
"""
import logging
import os
from trustgraph.schema import Error
from trustgraph.schema import IamRequest, IamResponse
from trustgraph.schema import iam_request_queue, iam_response_queue
from trustgraph.base import AsyncProcessor, Consumer, Producer
from trustgraph.base import ConsumerMetrics, ProducerMetrics
from trustgraph.base.cassandra_config import (
add_cassandra_args, resolve_cassandra_config,
)
from . iam import IamService
logger = logging.getLogger(__name__)
default_ident = "iam-svc"
default_iam_request_queue = iam_request_queue
default_iam_response_queue = iam_response_queue
# Environment variables consulted as a fallback when the
# corresponding params field is not set in the processor-group YAML
# or via CLI. Intended for K8s Secret / env-var injection so the
# bootstrap token never has to live in the YAML (and thus in git).
ENV_BOOTSTRAP_MODE = "IAM_BOOTSTRAP_MODE"
ENV_BOOTSTRAP_TOKEN = "IAM_BOOTSTRAP_TOKEN"
class Processor(AsyncProcessor):
    """IAM service processor.

    Terminates the IAM request queue: each IamRequest is forwarded to
    the IamService business logic, and the resulting IamResponse is
    published on the IAM response queue, correlated via the ``id``
    message property. Shape mirrors trustgraph.config.service.
    """

    def __init__(self, **params):
        """Resolve configuration, validate the bootstrap settings
        (fail-closed), and wire up the consumer/producer pair plus
        the Cassandra-backed IamService.

        Raises:
            RuntimeError: when bootstrap-mode is missing/invalid, when
                token mode lacks a token, or when a token is supplied
                alongside bootstrap mode (ambiguous intent).
        """

        iam_req_q = params.get(
            "iam_request_queue", default_iam_request_queue,
        )
        iam_resp_q = params.get(
            "iam_response_queue", default_iam_response_queue,
        )

        # Resolve bootstrap mode + token. Precedence: explicit
        # params (CLI / processor-group YAML) → environment variable
        # → unset (fail-closed). The env-var path is the K8s-native
        # injection point: an `IAM_BOOTSTRAP_TOKEN` from a Secret
        # never has to land in the YAML, and therefore never enters
        # git history.
        bootstrap_mode = (
            params.get("bootstrap_mode")
            or os.environ.get(ENV_BOOTSTRAP_MODE)
        )
        bootstrap_token = (
            params.get("bootstrap_token")
            or os.environ.get(ENV_BOOTSTRAP_TOKEN)
        )

        # Fail closed: refuse to start on an unset or unknown mode.
        if bootstrap_mode not in ("token", "bootstrap"):
            raise RuntimeError(
                "iam-svc: bootstrap-mode is required. Set to 'token' "
                "(with bootstrap-token) for production, or 'bootstrap' "
                "to enable the explicit bootstrap operation over the "
                "pub/sub bus (dev / quick-start only, not safe under "
                "public exposure). Configurable via processor-group "
                f"params or the {ENV_BOOTSTRAP_MODE} environment "
                "variable. Refusing to start."
            )

        if bootstrap_mode == "token" and not bootstrap_token:
            raise RuntimeError(
                "iam-svc: bootstrap-mode=token requires bootstrap-token "
                f"(or the {ENV_BOOTSTRAP_TOKEN} environment "
                "variable). Refusing to start."
            )

        if bootstrap_mode == "bootstrap" and bootstrap_token:
            raise RuntimeError(
                "iam-svc: bootstrap-token is not accepted when "
                "bootstrap-mode=bootstrap. Ambiguous intent. "
                "Refusing to start."
            )

        self.bootstrap_mode = bootstrap_mode
        self.bootstrap_token = bootstrap_token

        # Cassandra connection settings: explicit params win, the
        # resolver fills in env/default fallbacks and the keyspace.
        cassandra_host = params.get("cassandra_host")
        cassandra_username = params.get("cassandra_username")
        cassandra_password = params.get("cassandra_password")

        hosts, username, password, keyspace = resolve_cassandra_config(
            host=cassandra_host,
            username=cassandra_username,
            password=cassandra_password,
            default_keyspace="iam",
        )
        self.cassandra_host = hosts
        self.cassandra_username = username
        self.cassandra_password = password

        super().__init__(
            **params | {
                "iam_request_schema": IamRequest.__name__,
                "iam_response_schema": IamResponse.__name__,
                "cassandra_host": self.cassandra_host,
                "cassandra_username": self.cassandra_username,
                "cassandra_password": self.cassandra_password,
            }
        )

        iam_request_metrics = ConsumerMetrics(
            processor=self.id, flow=None, name="iam-request",
        )
        iam_response_metrics = ProducerMetrics(
            processor=self.id, flow=None, name="iam-response",
        )

        self.iam_request_topic = iam_req_q

        self.iam_request_consumer = Consumer(
            taskgroup=self.taskgroup,
            backend=self.pubsub,
            flow=None,
            topic=iam_req_q,
            subscriber=self.id,
            schema=IamRequest,
            handler=self.on_iam_request,
            metrics=iam_request_metrics,
        )

        self.iam_response_producer = Producer(
            backend=self.pubsub,
            topic=iam_resp_q,
            schema=IamResponse,
            metrics=iam_response_metrics,
        )

        self.iam = IamService(
            host=self.cassandra_host,
            username=self.cassandra_username,
            password=self.cassandra_password,
            keyspace=keyspace,
            bootstrap_mode=self.bootstrap_mode,
            bootstrap_token=self.bootstrap_token,
        )

        logger.info(
            "IAM service initialised (bootstrap-mode=%s)",
            self.bootstrap_mode,
        )

    async def start(self):
        """Bring the processor online: ensure the topic exists, run the
        token-mode auto-bootstrap, then start consuming."""

        await self.pubsub.ensure_topic(self.iam_request_topic)

        # Token-mode auto-bootstrap runs before we accept requests so
        # the first inbound call always sees a populated table.
        await self.iam.auto_bootstrap_if_token_mode()

        await self.iam_request_consumer.start()

    async def on_iam_request(self, msg, consumer, flow):
        """Handle one inbound IamRequest.

        On failure an IamResponse carrying an Error payload is returned
        to the caller — but only when the correlation id could be read
        from the message; otherwise the failure is just logged.
        """

        # Don't shadow the builtin `id`; initialise before the try so
        # the except branch knows whether a reply can be correlated.
        request_id = None

        try:
            v = msg.value()
            request_id = msg.properties()["id"]

            logger.debug(
                "Handling IAM request %s op=%r", request_id, v.operation,
            )

            resp = await self.iam.handle(v)

            await self.iam_response_producer.send(
                resp, properties={"id": request_id},
            )

        except Exception as e:

            logger.error(
                "IAM request failed: %s: %s", type(e).__name__, e,
                exc_info=True,
            )

            resp = IamResponse(
                error=Error(type="internal-error", message=str(e)),
            )

            if request_id is not None:
                await self.iam_response_producer.send(
                    resp, properties={"id": request_id},
                )

    @staticmethod
    def add_args(parser):
        """Add IAM-specific CLI arguments on top of the base processor
        and Cassandra argument sets."""

        AsyncProcessor.add_args(parser)

        parser.add_argument(
            "--iam-request-queue",
            default=default_iam_request_queue,
            help=f"IAM request queue (default: {default_iam_request_queue})",
        )

        parser.add_argument(
            "--iam-response-queue",
            default=default_iam_response_queue,
            help=f"IAM response queue (default: {default_iam_response_queue})",
        )

        parser.add_argument(
            "--bootstrap-mode",
            default=None,
            choices=["token", "bootstrap"],
            help=(
                "IAM bootstrap mode (required). "
                "'token' = operator supplies the initial admin API "
                "key via --bootstrap-token; auto-seeds on first start, "
                "bootstrap operation refused. "
                "'bootstrap' = bootstrap operation is live over the "
                "bus until tables are populated; a token is generated "
                "and returned by tg-bootstrap-iam. Unsafe to run "
                "'bootstrap' mode with public exposure."
            ),
        )

        parser.add_argument(
            "--bootstrap-token",
            default=None,
            help=(
                "Initial admin API key plaintext, required when "
                "--bootstrap-mode=token. Treat as a one-time "
                "credential: the operator should rotate to a new key "
                "and revoke this one after first use."
            ),
        )

        add_cassandra_args(parser)
def run():
    """Console-script entry point: launch the IAM service processor."""
    Processor.launch(default_ident, __doc__)

View file

@ -4,7 +4,7 @@ Simple LLM service, performs text prompt completion using an Ollama service.
Input is prompt, output is response.
"""
from ollama import Client
from ollama import AsyncClient
import os
import logging
@ -38,23 +38,23 @@ class Processor(LlmService):
self.default_model = model
self.temperature = temperature
self.llm = Client(host=ollama)
self.llm = AsyncClient(host=ollama)
self._checked_models = set()
def _ensure_model(self, model_name):
async def _ensure_model(self, model_name):
"""Check if model exists locally, pull it if not."""
if model_name in self._checked_models:
return
try:
self.llm.show(model_name)
await self.llm.show(model_name)
self._checked_models.add(model_name)
except Exception as e:
status_code = getattr(e, 'status_code', None)
if status_code == 404 or "not found" in str(e).lower():
logger.info(f"Ollama model '{model_name}' not found locally. Pulling, this may take a while...")
try:
self.llm.pull(model_name)
await self.llm.pull(model_name)
self._checked_models.add(model_name)
logger.info(f"Successfully pulled Ollama model '{model_name}'.")
except Exception as pull_e:
@ -66,9 +66,9 @@ class Processor(LlmService):
# Use provided model or fall back to default
model_name = model or self.default_model
# Ensure the model exists/is pulled
self._ensure_model(model_name)
await self._ensure_model(model_name)
# Use provided temperature or fall back to default
effective_temperature = temperature if temperature is not None else self.temperature
@ -79,7 +79,7 @@ class Processor(LlmService):
try:
response = self.llm.generate(model_name, prompt, options={'temperature': effective_temperature})
response = await self.llm.generate(model_name, prompt, options={'temperature': effective_temperature})
response_text = response['response']
logger.debug("Sending response...")
@ -113,7 +113,7 @@ class Processor(LlmService):
model_name = model or self.default_model
# Ensure the model exists/is pulled
self._ensure_model(model_name)
await self._ensure_model(model_name)
effective_temperature = temperature if temperature is not None else self.temperature
@ -123,7 +123,7 @@ class Processor(LlmService):
prompt = system + "\n\n" + prompt
try:
stream = self.llm.generate(
stream = await self.llm.generate(
model_name,
prompt,
options={'temperature': effective_temperature},
@ -133,7 +133,7 @@ class Processor(LlmService):
total_input_tokens = 0
total_output_tokens = 0
for chunk in stream:
async for chunk in stream:
if 'response' in chunk and chunk['response']:
yield LlmChunk(
text=chunk['response'],

View file

@ -0,0 +1,436 @@
"""
IAM Cassandra table store.
Tables:
- iam_workspaces (id primary key)
- iam_users (id primary key) + iam_users_by_username lookup table
(workspace, username) -> id
- iam_api_keys (key_hash primary key) with secondary index on user_id
- iam_signing_keys (kid primary key) RSA keypairs for JWT signing
See docs/tech-specs/iam-protocol.md for the wire-level context.
"""
import logging
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from ssl import SSLContext, PROTOCOL_TLSv1_2
from . cassandra_async import async_execute
logger = logging.getLogger(__name__)
class IamTableStore:
    def __init__(
        self,
        cassandra_host, cassandra_username, cassandra_password,
        keyspace,
    ):
        """Connect to Cassandra, ensure the IAM schema exists and
        prepare all statements.

        ``cassandra_host`` may be a comma-separated string or a list
        of contact points. When both username and password are given,
        the connection uses TLS 1.2 with plain-text auth; otherwise an
        unauthenticated, unencrypted cluster connection is made.
        """
        self.keyspace = keyspace
        logger.info("IAM: connecting to Cassandra...")
        # Accept "host1,host2" as well as an already-split list.
        if isinstance(cassandra_host, str):
            cassandra_host = [h.strip() for h in cassandra_host.split(",")]
        if cassandra_username and cassandra_password:
            # NOTE(review): credentials imply TLS here; an
            # authenticated-but-plaintext deployment is not supported
            # by this branch — confirm that is intended.
            ssl_context = SSLContext(PROTOCOL_TLSv1_2)
            auth_provider = PlainTextAuthProvider(
                username=cassandra_username, password=cassandra_password,
            )
            self.cluster = Cluster(
                cassandra_host,
                auth_provider=auth_provider,
                ssl_context=ssl_context,
            )
        else:
            self.cluster = Cluster(cassandra_host)
        self.cassandra = self.cluster.connect()
        logger.info("IAM: connected.")
        # Both are idempotent, so safe on every service start.
        self._ensure_schema()
        self._prepare_statements()
    def _ensure_schema(self):
        """Create the IAM keyspace, tables and indexes if absent.

        Every statement is ``IF NOT EXISTS`` so the whole method is
        idempotent and safe to run on every start.
        """
        # FIXME: Replication factor should be configurable.
        self.cassandra.execute(f"""
            create keyspace if not exists {self.keyspace}
            with replication = {{
                'class' : 'SimpleStrategy',
                'replication_factor' : 1
            }};
        """)
        self.cassandra.set_keyspace(self.keyspace)
        # Workspaces: one row per workspace.
        self.cassandra.execute("""
            CREATE TABLE IF NOT EXISTS iam_workspaces (
                id text PRIMARY KEY,
                name text,
                enabled boolean,
                created timestamp
            );
        """)
        # Users, keyed by opaque id; username resolution goes via the
        # iam_users_by_username lookup table below.
        self.cassandra.execute("""
            CREATE TABLE IF NOT EXISTS iam_users (
                id text PRIMARY KEY,
                workspace text,
                username text,
                name text,
                email text,
                password_hash text,
                roles set<text>,
                enabled boolean,
                must_change_password boolean,
                created timestamp
            );
        """)
        # (workspace, username) -> user_id lookup table.
        self.cassandra.execute("""
            CREATE TABLE IF NOT EXISTS iam_users_by_username (
                workspace text,
                username text,
                user_id text,
                PRIMARY KEY ((workspace), username)
            );
        """)
        # API keys keyed by hash of the plaintext; secondary indexes
        # allow lookup by owner and by key record id.
        self.cassandra.execute("""
            CREATE TABLE IF NOT EXISTS iam_api_keys (
                key_hash text PRIMARY KEY,
                id text,
                user_id text,
                name text,
                prefix text,
                expires timestamp,
                created timestamp,
                last_used timestamp
            );
        """)
        self.cassandra.execute("""
            CREATE INDEX IF NOT EXISTS iam_api_keys_user_id_idx
            ON iam_api_keys (user_id);
        """)
        self.cassandra.execute("""
            CREATE INDEX IF NOT EXISTS iam_api_keys_id_idx
            ON iam_api_keys (id);
        """)
        # RSA keypairs used for JWT signing; `retired` marks rotation.
        self.cassandra.execute("""
            CREATE TABLE IF NOT EXISTS iam_signing_keys (
                kid text PRIMARY KEY,
                private_pem text,
                public_pem text,
                created timestamp,
                retired timestamp
            );
        """)
        logger.info("IAM: Cassandra schema OK.")
    def _prepare_statements(self):
        """Prepare every CQL statement used by the store, once, at
        construction time; the prepared handles are kept as instance
        attributes for reuse by the async accessor methods."""
        c = self.cassandra
        # --- Workspaces -------------------------------------------
        self.put_workspace_stmt = c.prepare("""
            INSERT INTO iam_workspaces (id, name, enabled, created)
            VALUES (?, ?, ?, ?)
        """)
        self.get_workspace_stmt = c.prepare("""
            SELECT id, name, enabled, created FROM iam_workspaces
            WHERE id = ?
        """)
        self.list_workspaces_stmt = c.prepare("""
            SELECT id, name, enabled, created FROM iam_workspaces
        """)
        # --- Users ------------------------------------------------
        self.put_user_stmt = c.prepare("""
            INSERT INTO iam_users (
                id, workspace, username, name, email, password_hash,
                roles, enabled, must_change_password, created
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """)
        self.get_user_stmt = c.prepare("""
            SELECT id, workspace, username, name, email, password_hash,
                   roles, enabled, must_change_password, created
            FROM iam_users WHERE id = ?
        """)
        # NOTE(review): ALLOW FILTERING scans — acceptable only while
        # user counts stay small; confirm scale expectations.
        self.list_users_by_workspace_stmt = c.prepare("""
            SELECT id, workspace, username, name, email, password_hash,
                   roles, enabled, must_change_password, created
            FROM iam_users WHERE workspace = ? ALLOW FILTERING
        """)
        self.list_users_stmt = c.prepare("""
            SELECT id, workspace, username, name, email, password_hash,
                   roles, enabled, must_change_password, created
            FROM iam_users
        """)
        # --- Username lookup table --------------------------------
        self.put_username_lookup_stmt = c.prepare("""
            INSERT INTO iam_users_by_username (workspace, username, user_id)
            VALUES (?, ?, ?)
        """)
        self.get_user_id_by_username_stmt = c.prepare("""
            SELECT user_id FROM iam_users_by_username
            WHERE workspace = ? AND username = ?
        """)
        self.delete_username_lookup_stmt = c.prepare("""
            DELETE FROM iam_users_by_username
            WHERE workspace = ? AND username = ?
        """)
        self.delete_user_stmt = c.prepare("""
            DELETE FROM iam_users WHERE id = ?
        """)
        # --- API keys ---------------------------------------------
        self.put_api_key_stmt = c.prepare("""
            INSERT INTO iam_api_keys (
                key_hash, id, user_id, name, prefix, expires,
                created, last_used
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """)
        self.get_api_key_by_hash_stmt = c.prepare("""
            SELECT key_hash, id, user_id, name, prefix, expires,
                   created, last_used
            FROM iam_api_keys WHERE key_hash = ?
        """)
        self.get_api_key_by_id_stmt = c.prepare("""
            SELECT key_hash, id, user_id, name, prefix, expires,
                   created, last_used
            FROM iam_api_keys WHERE id = ?
        """)
        self.list_api_keys_by_user_stmt = c.prepare("""
            SELECT key_hash, id, user_id, name, prefix, expires,
                   created, last_used
            FROM iam_api_keys WHERE user_id = ?
        """)
        self.delete_api_key_stmt = c.prepare("""
            DELETE FROM iam_api_keys WHERE key_hash = ?
        """)
        # --- Signing keys -----------------------------------------
        self.put_signing_key_stmt = c.prepare("""
            INSERT INTO iam_signing_keys (
                kid, private_pem, public_pem, created, retired
            )
            VALUES (?, ?, ?, ?, ?)
        """)
        self.list_signing_keys_stmt = c.prepare("""
            SELECT kid, private_pem, public_pem, created, retired
            FROM iam_signing_keys
        """)
        self.retire_signing_key_stmt = c.prepare("""
            UPDATE iam_signing_keys SET retired = ? WHERE kid = ?
        """)
        # --- Partial updates --------------------------------------
        self.update_user_profile_stmt = c.prepare("""
            UPDATE iam_users
            SET name = ?, email = ?, roles = ?, enabled = ?,
                must_change_password = ?
            WHERE id = ?
        """)
        self.update_user_password_stmt = c.prepare("""
            UPDATE iam_users
            SET password_hash = ?, must_change_password = ?
            WHERE id = ?
        """)
        self.update_user_enabled_stmt = c.prepare("""
            UPDATE iam_users SET enabled = ? WHERE id = ?
        """)
        self.update_workspace_stmt = c.prepare("""
            UPDATE iam_workspaces SET name = ?, enabled = ?
            WHERE id = ?
        """)
# ------------------------------------------------------------------
# Workspaces
# ------------------------------------------------------------------
async def put_workspace(self, id, name, enabled, created):
await async_execute(
self.cassandra, self.put_workspace_stmt,
(id, name, enabled, created),
)
async def get_workspace(self, id):
rows = await async_execute(
self.cassandra, self.get_workspace_stmt, (id,),
)
return rows[0] if rows else None
async def list_workspaces(self):
return await async_execute(
self.cassandra, self.list_workspaces_stmt,
)
# ------------------------------------------------------------------
# Users
# ------------------------------------------------------------------
async def put_user(
self, id, workspace, username, name, email, password_hash,
roles, enabled, must_change_password, created,
):
await async_execute(
self.cassandra, self.put_user_stmt,
(
id, workspace, username, name, email, password_hash,
set(roles) if roles else set(),
enabled, must_change_password, created,
),
)
await async_execute(
self.cassandra, self.put_username_lookup_stmt,
(workspace, username, id),
)
async def get_user(self, id):
rows = await async_execute(
self.cassandra, self.get_user_stmt, (id,),
)
return rows[0] if rows else None
async def get_user_id_by_username(self, workspace, username):
rows = await async_execute(
self.cassandra, self.get_user_id_by_username_stmt,
(workspace, username),
)
return rows[0][0] if rows else None
async def list_users_by_workspace(self, workspace):
return await async_execute(
self.cassandra, self.list_users_by_workspace_stmt, (workspace,),
)
async def list_users(self):
"""List every user across the deployment. Used by the
system-level list-users handler when no workspace filter is
supplied; the gateway has already authorised the call against
the caller's authority."""
return await async_execute(
self.cassandra, self.list_users_stmt, (),
)
async def delete_user(self, id):
await async_execute(
self.cassandra, self.delete_user_stmt, (id,),
)
async def delete_username_lookup(self, workspace, username):
await async_execute(
self.cassandra, self.delete_username_lookup_stmt,
(workspace, username),
)
# ------------------------------------------------------------------
# API keys
# ------------------------------------------------------------------
async def put_api_key(
self, key_hash, id, user_id, name, prefix, expires,
created, last_used,
):
await async_execute(
self.cassandra, self.put_api_key_stmt,
(key_hash, id, user_id, name, prefix, expires,
created, last_used),
)
async def get_api_key_by_hash(self, key_hash):
rows = await async_execute(
self.cassandra, self.get_api_key_by_hash_stmt, (key_hash,),
)
return rows[0] if rows else None
async def get_api_key_by_id(self, id):
rows = await async_execute(
self.cassandra, self.get_api_key_by_id_stmt, (id,),
)
return rows[0] if rows else None
async def list_api_keys_by_user(self, user_id):
return await async_execute(
self.cassandra, self.list_api_keys_by_user_stmt, (user_id,),
)
async def delete_api_key(self, key_hash):
await async_execute(
self.cassandra, self.delete_api_key_stmt, (key_hash,),
)
# ------------------------------------------------------------------
# Signing keys
# ------------------------------------------------------------------
async def put_signing_key(self, kid, private_pem, public_pem,
created, retired):
await async_execute(
self.cassandra, self.put_signing_key_stmt,
(kid, private_pem, public_pem, created, retired),
)
async def list_signing_keys(self):
return await async_execute(
self.cassandra, self.list_signing_keys_stmt,
)
async def retire_signing_key(self, kid, retired):
await async_execute(
self.cassandra, self.retire_signing_key_stmt,
(retired, kid),
)
# ------------------------------------------------------------------
# User partial updates
# ------------------------------------------------------------------
async def update_user_profile(
self, id, name, email, roles, enabled, must_change_password,
):
await async_execute(
self.cassandra, self.update_user_profile_stmt,
(
name, email,
set(roles) if roles else set(),
enabled, must_change_password, id,
),
)
async def update_user_password(
self, id, password_hash, must_change_password,
):
await async_execute(
self.cassandra, self.update_user_password_stmt,
(password_hash, must_change_password, id),
)
async def update_user_enabled(self, id, enabled):
await async_execute(
self.cassandra, self.update_user_enabled_stmt,
(enabled, id),
)
# ------------------------------------------------------------------
# Workspace updates
# ------------------------------------------------------------------
async def update_workspace(self, id, name, enabled):
await async_execute(
self.cassandra, self.update_workspace_stmt,
(name, enabled, id),
)
# ------------------------------------------------------------------
# Bootstrap helpers
# ------------------------------------------------------------------
async def any_workspace_exists(self):
rows = await self.list_workspaces()
return bool(rows)