Merge branch 'release/v2.4'

This commit is contained in:
Cyber MacGeddon 2026-04-29 17:56:48 +01:00
commit f3434307c5
91 changed files with 10657 additions and 1218 deletions


@@ -75,6 +75,13 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: "Free up some disk space"
run: |
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
podman image prune --all --force
podman builder prune -a -f
- name: Docker Hub token
run: echo ${{ secrets.DOCKER_SECRET }} > docker-token.txt


@@ -0,0 +1,475 @@
#!/usr/bin/env python3
"""
WebSocket smoke / load test that hammers a TrustGraph gateway with a
mix of `embeddings`, `graph-embeddings`, and `triples` queries while
keeping a target number of in-flight requests at all times.
Useful for reproducing the "worker hangs after a while, all subsequent
requests time out" failure mode: it keeps enough load on the system to
saturate worker concurrency and reports per-service success/timeout
rates and latency distributions over time.
Usage:
smoke_ws_queries.py --flow onto-rag --duration 120 --concurrency 20
Connects via /api/v1/socket using the first-frame auth protocol.
"""
import argparse
import asyncio
import json
import os
import random
import statistics
import sys
import time
import uuid
from collections import defaultdict
from typing import Any
import websockets
DEFAULT_TEXT = (
"What caused the space shuttle to explode and what were the "
"main factors leading to the disaster?"
)
class Stats:
"""Per-service rolling counters and latency samples."""
def __init__(self) -> None:
self.sent = 0
self.ok = 0
self.err = 0
self.timeout = 0
self.latencies_ms: list[float] = []
def record_ok(self, latency_ms: float) -> None:
self.ok += 1
self.latencies_ms.append(latency_ms)
def record_err(self) -> None:
self.err += 1
def record_timeout(self) -> None:
self.timeout += 1
def percentile(self, p: float) -> float:
if not self.latencies_ms:
return 0.0
s = sorted(self.latencies_ms)
idx = min(len(s) - 1, int(len(s) * p))
return s[idx]
def summary(self) -> str:
if self.latencies_ms:
mn = min(self.latencies_ms)
mx = max(self.latencies_ms)
mean = statistics.mean(self.latencies_ms)
p50 = self.percentile(0.50)
p95 = self.percentile(0.95)
p99 = self.percentile(0.99)
lat = (
f"min={mn:.0f} mean={mean:.0f} p50={p50:.0f} "
f"p95={p95:.0f} p99={p99:.0f} max={mx:.0f} ms"
)
else:
lat = "no successful samples"
return (
f"sent={self.sent} ok={self.ok} err={self.err} "
f"timeout={self.timeout} | {lat}"
)
class WSClient:
"""Thin async websocket client with first-frame auth and a shared
reader task that demuxes responses to per-request asyncio queues."""
def __init__(
self, url: str, token: str | None, workspace: str,
ping_timeout: int,
) -> None:
self.url = url
self.token = token
self.workspace = workspace
self.ping_timeout = ping_timeout
self._ws: Any = None
self._pending: dict[str, asyncio.Queue] = {}
self._reader_task: asyncio.Task | None = None
self._closed = asyncio.Event()
async def connect(self) -> None:
ws_url = self.url.rstrip("/") + "/api/v1/socket"
if ws_url.startswith("http://"):
ws_url = "ws://" + ws_url[len("http://"):]
elif ws_url.startswith("https://"):
ws_url = "wss://" + ws_url[len("https://"):]
elif not (
ws_url.startswith("ws://") or ws_url.startswith("wss://")
):
ws_url = "ws://" + ws_url
self._ws = await websockets.connect(
ws_url,
ping_interval=20,
ping_timeout=self.ping_timeout,
max_size=64 * 1024 * 1024,
)
if self.token:
# First-frame auth handshake.
await self._ws.send(json.dumps({
"type": "auth", "token": self.token,
}))
raw = await asyncio.wait_for(self._ws.recv(), timeout=10)
resp = json.loads(raw)
if resp.get("type") != "auth-ok":
await self._ws.close()
raise RuntimeError(f"auth failed: {resp}")
if "workspace" in resp:
# Server-resolved workspace overrides the user-supplied
# one, mirroring AsyncSocketClient behaviour.
self.workspace = resp["workspace"]
else:
print(
"WARNING: no token provided — skipping auth handshake. "
"Requests will be rejected unless the gateway is "
"running without IAM enforcement.",
file=sys.stderr,
)
self._reader_task = asyncio.create_task(self._reader())
async def _reader(self) -> None:
try:
async for raw in self._ws:
msg = json.loads(raw)
rid = msg.get("id")
if rid and rid in self._pending:
await self._pending[rid].put(msg)
except websockets.exceptions.ConnectionClosed:
pass
except Exception as e:
for q in list(self._pending.values()):
try:
q.put_nowait({"error": {"message": str(e)}})
except Exception:
pass
finally:
self._closed.set()
async def request(
self, service: str, flow: str | None, body: dict, timeout: float,
) -> tuple[dict | None, str | None, float]:
"""Send one request, await final response.
Returns ``(response, error, latency_ms)``. ``response`` is None
on error/timeout. ``error`` describes the failure category.
"""
rid = str(uuid.uuid4())
q: asyncio.Queue = asyncio.Queue()
self._pending[rid] = q
env = {
"id": rid,
"workspace": self.workspace,
"service": service,
"request": body,
}
if flow:
env["flow"] = flow
t0 = time.monotonic()
try:
await self._ws.send(json.dumps(env))
while True:
try:
msg = await asyncio.wait_for(q.get(), timeout=timeout)
except asyncio.TimeoutError:
return None, "timeout", (time.monotonic() - t0) * 1000
if "error" in msg and msg["error"]:
err = msg["error"]
err_msg = (
err.get("message") if isinstance(err, dict) else str(err)
)
return None, f"error: {err_msg}", (time.monotonic() - t0) * 1000
if msg.get("complete"):
return msg.get("response"), None, (time.monotonic() - t0) * 1000
# Otherwise an intermediate streaming chunk — keep waiting.
finally:
self._pending.pop(rid, None)
async def close(self) -> None:
if self._ws is not None:
await self._ws.close()
if self._reader_task is not None:
try:
await asyncio.wait_for(self._reader_task, timeout=2)
except (asyncio.TimeoutError, asyncio.CancelledError):
pass
def parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description=__doc__)
p.add_argument(
"--url",
default=os.getenv("TRUSTGRAPH_URL", "http://localhost:8088/"),
help="Gateway URL (http or ws). Default: %(default)s",
)
p.add_argument(
"--token",
default=os.getenv("TRUSTGRAPH_TOKEN"),
help="Auth token (or set TRUSTGRAPH_TOKEN). Optional — if "
"omitted, the auth handshake is skipped (only works "
"when the gateway is running without IAM enforcement).",
)
p.add_argument(
"--workspace", default="default",
help="Workspace. Default: %(default)s",
)
p.add_argument(
"--flow", required=True,
help="Flow id. Comma-separated for round-robin across flows "
"(e.g. onto-rag,doc-rag).",
)
p.add_argument(
"--duration", type=int, default=60,
help="Test duration in seconds. Default: %(default)s",
)
p.add_argument(
"--concurrency", type=int, default=15,
help="Target in-flight request count. Default: %(default)s",
)
p.add_argument(
"--services",
default="embeddings,graph-embeddings,triples",
help="Comma-separated services to exercise. "
"Default: %(default)s",
)
p.add_argument(
"--limit", type=int, default=3,
help="limit for triples / graph-embeddings queries. "
"Default: %(default)s",
)
p.add_argument(
"--collection", default="default",
help="Collection. Default: %(default)s",
)
p.add_argument(
"--text", default=DEFAULT_TEXT,
help="Text to embed for embeddings/seed.",
)
p.add_argument(
"--vector-dim", type=int, default=384,
help="Dimension of synthetic vector when --no-seed is used. "
"Default: %(default)s",
)
p.add_argument(
"--no-seed", action="store_true",
help="Skip the embeddings warm-up call. Use a random vector "
"for graph-embeddings queries instead.",
)
p.add_argument(
"--request-timeout", type=float, default=30.0,
help="Per-request timeout (seconds). Default: %(default)s",
)
p.add_argument(
"--report-interval", type=float, default=5.0,
help="How often to print stats (seconds). Default: %(default)s",
)
p.add_argument(
"--ping-timeout", type=int, default=120,
help="Websocket ping timeout. Default: %(default)s",
)
p.add_argument(
"--seed", type=int, default=None,
help="Random seed (for reproducibility).",
)
return p.parse_args()
async def seed_vector(
client: WSClient, flow: str, text: str, timeout: float,
) -> list[float]:
"""Issue one embeddings request to obtain a real vector that
later graph-embeddings calls can reuse."""
resp, err, _ = await client.request(
"embeddings", flow, {"texts": [text]}, timeout,
)
if err or not resp:
raise RuntimeError(f"seed embeddings failed: {err or resp}")
vectors = resp.get("vectors")
if not vectors:
raise RuntimeError(f"seed embeddings: no vectors in response: {resp}")
return vectors[0]
def make_request_body(
service: str, args: argparse.Namespace, vector: list[float],
) -> dict:
if service == "embeddings":
return {"texts": [args.text]}
if service == "graph-embeddings":
return {
"vector": vector,
"limit": args.limit,
"collection": args.collection,
}
if service == "triples":
return {
"limit": args.limit,
"collection": args.collection,
}
raise ValueError(f"Unknown service: {service}")
async def worker(
name: int,
client: WSClient,
flows: list[str],
services: list[str],
args: argparse.Namespace,
vector: list[float],
stats: dict[str, Stats],
in_flight: dict[str, int],
stop_at: float,
) -> None:
rng = random.Random((args.seed or 0) + name)
while time.monotonic() < stop_at:
svc = rng.choice(services)
flow = rng.choice(flows)
body = make_request_body(svc, args, vector)
stats[svc].sent += 1
in_flight[svc] += 1
try:
resp, err, lat = await client.request(
svc, flow, body, args.request_timeout,
)
if err == "timeout":
stats[svc].record_timeout()
elif err:
stats[svc].record_err()
else:
stats[svc].record_ok(lat)
except Exception as e:
stats[svc].record_err()
print(f"worker {name}: unexpected {svc} exception: {e}",
file=sys.stderr)
finally:
in_flight[svc] -= 1
async def reporter(
services: list[str],
stats: dict[str, Stats],
in_flight: dict[str, int],
stop_at: float,
interval: float,
) -> None:
started = time.monotonic()
last_sent = {s: 0 for s in services}
while time.monotonic() < stop_at:
await asyncio.sleep(interval)
now = time.monotonic()
elapsed = now - started
total_inflight = sum(in_flight.values())
print(
f"\n[{elapsed:6.1f}s] in-flight={total_inflight} "
f"per-svc={dict(in_flight)}"
)
for svc in services:
s = stats[svc]
delta = s.sent - last_sent[svc]
rate = delta / interval
last_sent[svc] = s.sent
print(f" {svc:20s} {rate:6.1f}/s | {s.summary()}")
async def run(args: argparse.Namespace) -> int:
if args.seed is not None:
random.seed(args.seed)
services = [s.strip() for s in args.services.split(",") if s.strip()]
flows = [f.strip() for f in args.flow.split(",") if f.strip()]
valid = {"embeddings", "graph-embeddings", "triples"}
bad = [s for s in services if s not in valid]
if bad:
print(f"ERROR: unknown service(s): {bad}. "
f"Supported: {sorted(valid)}", file=sys.stderr)
return 2
client = WSClient(
args.url, args.token, args.workspace, args.ping_timeout,
)
print(f"Connecting to {args.url} ...")
await client.connect()
print(f"Connected. workspace={client.workspace} flows={flows} "
f"services={services} concurrency={args.concurrency} "
f"duration={args.duration}s")
if "graph-embeddings" in services and not args.no_seed:
print("Seeding embedding vector ...")
vector = await seed_vector(
client, flows[0], args.text, args.request_timeout,
)
print(f"Got vector of length {len(vector)}")
else:
vector = [random.uniform(-1.0, 1.0) for _ in range(args.vector_dim)]
stats: dict[str, Stats] = defaultdict(Stats)
in_flight: dict[str, int] = defaultdict(int)
for svc in services:
stats[svc] # initialise
in_flight[svc] = 0
stop_at = time.monotonic() + args.duration
print(f"Starting load: {args.concurrency} workers for "
f"{args.duration}s ...")
workers = [
asyncio.create_task(
worker(
i, client, flows, services, args, vector,
stats, in_flight, stop_at,
)
)
for i in range(args.concurrency)
]
rep = asyncio.create_task(
reporter(services, stats, in_flight, stop_at, args.report_interval)
)
try:
await asyncio.gather(*workers)
finally:
rep.cancel()
try:
await rep
except asyncio.CancelledError:
pass
print("\n=== Final results ===")
any_failures = False
for svc in services:
s = stats[svc]
print(f" {svc:20s} {s.summary()}")
if s.timeout > 0 or s.err > 0:
any_failures = True
await client.close()
return 1 if any_failures else 0
def main() -> int:
args = parse_args()
try:
return asyncio.run(run(args))
except KeyboardInterrupt:
return 130
if __name__ == "__main__":
sys.exit(main())


@@ -0,0 +1,297 @@
---
layout: default
title: "Bootstrap Framework Technical Specification"
parent: "Tech Specs"
---
# Bootstrap Framework Technical Specification
## Overview
A generic, pluggable framework for running one-time initialisation steps
against a TrustGraph deployment — replacing the dedicated
`tg-init-trustgraph` container with a long-running processor that
converges the system to a desired initial state and then idles.
The framework is content-agnostic. It knows how to run, retry,
mark-as-done, and surface failures; the actual init work lives in
small pluggable classes called **initialisers**. Core initialisers
ship in the `trustgraph-flow` package; enterprise and third-party
initialisers can be loaded by dotted path without any core code
change.
## Motivation
The existing `tg-init-trustgraph` is a one-shot CLI run in its own
container. It performs two very different jobs (Pulsar topology
setup and config seeding) in a single script, is wasteful as a whole
container, cannot handle partial-success states, and has no way to
extend the boot process with enterprise-specific concerns (user
provisioning, workspace initialisation, IAM scaffolding) without
forking the tool.
A pluggable, long-running reconciler addresses all of this and slots
naturally into the existing processor-group model.
## Design
### Bootstrapper Processor
A single `AsyncProcessor` subclass. One entry in a processor group.
Parameters include the processor's own identity and a list of
**initialiser specifications** — each spec names a class (by dotted
path), a unique instance name, a flag string, and the parameters
that will be passed to the initialiser's constructor.
On each wake the bootstrapper does the following, in order:
1. Open a short-lived context (config client, flow-svc client,
logger). The context is torn down at the end of the wake so
steady-state idle cost is effectively nil.
2. Run all **pre-service initialisers** (those that opt out of the
service gate — principally `PulsarTopology`, which must run
before the services it gates on can even come up).
3. Check the **service gate**: cheap round-trips to config-svc and
flow-svc. If either fails, skip to the sleep step using the
short gate-retry cadence.
4. Run all **post-service initialisers** that haven't already
completed at the currently-configured flag.
5. Sleep. Cadence adapts to state (see below).
### Initialiser Contract
An initialiser is a class with:
- A class-level `name` identifier, unique within the bootstrapper's
configuration. This is the key under which completion state is
stored.
- A class-level `wait_for_services` flag. When `True` (the default)
the initialiser runs only after the service gate passes. When
`False`, it runs before the gate, on every wake.
- A constructor that accepts the initialiser's own params as kwargs.
- An async `run(ctx, old_flag, new_flag)` method that performs the
init work and returns on success. Any raised exception is
logged and treated as a transient failure — the stored flag is
not updated and the initialiser will re-run on the next cycle.
`old_flag` is the previously-stored flag string, or `None` if the
initialiser has never successfully run in this deployment. `new_flag`
is the flag the operator has configured for this run. This pair
lets an initialiser distinguish a clean first-run from a migration
between flag versions and behave accordingly (see "Flag change and
re-run safety" below).
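As a rough illustration, a minimal initialiser satisfying this contract might look like the sketch below. The `Initialiser` base-class name and module layout are assumptions for illustration; only the `name` / `wait_for_services` attributes and the `run(ctx, old_flag, new_flag)` signature come from this specification.
```python
# Hedged sketch of the initialiser contract; the base-class name is assumed.
class Initialiser:
    name: str = ""                  # unique within the bootstrapper's configuration
    wait_for_services: bool = True  # False only for pre-gate infrastructure setup

    def __init__(self, **params):
        # Params come verbatim from the initialiser specification.
        self.params = params

    async def run(self, ctx, old_flag, new_flag):
        raise NotImplementedError


class ExampleSeed(Initialiser):
    """Distinguishes a clean first run from a flag-bump re-run."""
    name = "example-seed"

    async def run(self, ctx, old_flag, new_flag):
        if old_flag is None:
            ctx.logger.info("clean first run at flag %s", new_flag)
        else:
            ctx.logger.info("flag bump %s -> %s, re-applying", old_flag, new_flag)
        # Idempotent init work via ctx.config / ctx.flow goes here; any
        # exception raised is logged and retried on the next cycle.
```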
### Context
The context is the bootstrapper-owned object passed to every
initialiser's `run()` method. Its fields are deliberately narrow:
| Field | Purpose |
|---|---|
| `logger` | A child logger named for the initialiser instance |
| `config` | A short-lived `ConfigClient` for config-svc reads/writes |
| `flow` | A short-lived `RequestResponse` client for flow-svc |
The context is always fully-populated regardless of which services
a given initialiser uses, for symmetry. Additional fields may be
added in future without breaking existing initialisers. Clients are
started at the beginning of a wake cycle and stopped at the end.
Initialisers that need services beyond config-svc and flow-svc are
responsible for their own readiness checks and for raising cleanly
when a prerequisite is not met.
### Completion State
Per-initialiser completion state is stored in the reserved
`__system__` workspace, under a dedicated config type for bootstrap
state. The stored value is the flag string that was configured when
the initialiser last succeeded.
On each cycle, for each initialiser, the bootstrapper reads the
stored flag and compares it to the currently-configured flag. If
they match, the initialiser is skipped silently. If they differ,
the initialiser runs; on success, the stored flag is updated.
Because the state lives in a reserved (`_`-prefixed) workspace, it
is stored by config-svc but excluded from the config push broadcast.
Live processors never see it and cannot act on it.
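A sketch of that per-initialiser check, under the assumption of a simple get/put config API over the `__system__` workspace and the `init-state` type (the real `ConfigClient` interface may differ):
```python
# Assumed ConfigClient-style helpers; workspace and type names are from this spec.
async def read_flag(ctx, name):
    return await ctx.config.get("__system__", "init-state", name)  # None if never run

async def store_flag(ctx, name, flag):
    await ctx.config.put("__system__", "init-state", name, flag)

async def reconcile_one(ctx, init, configured_flag):
    """Compare flags, run if they differ, store the new flag only on success."""
    stored_flag = await read_flag(ctx, init.name)
    if stored_flag == configured_flag:
        return True                       # already converged; skip silently
    try:
        await init.run(ctx, stored_flag, configured_flag)
    except Exception:
        ctx.logger.exception("initialiser %s failed; will retry next cycle", init.name)
        return False                      # stored flag left untouched
    await store_flag(ctx, init.name, configured_flag)
    return True
```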
### The Service Gate
The gate is a cheap, bootstrapper-internal check that config-svc
and flow-svc are both reachable and responsive. It is intentionally
a simple pair of low-cost round-trips — a config list against
`__system__` and a flow-svc `list-blueprints` — rather than any
deeper health check.
Its purpose is to avoid filling logs with noise and to concentrate
retry effort during the brief window when services are coming up.
The gate is applied only to initialisers with
`wait_for_services=True` (the default); `False` is reserved for
initialisers that set up infrastructure the gate itself depends on.
### Adaptive Cadence
The sleep between wake cycles is chosen from three tiers based on
observed state:
| Tier | Duration | When |
|---|---|---|
| Gate backoff | ~5 s | Services not responding — concentrate retry during startup |
| Init retry | ~15 s | Gate passes but at least one initialiser is not yet at its configured flag — transient failures, waiting on prereqs, recently-bumped flag not yet applied |
| Steady | ~300 s | All configured initialisers at their configured flag; gate passes; nothing to do |
The short tiers ensure a fresh deployment converges quickly;
steady state costs a single round-trip per initialiser every few
minutes.
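The tier selection reduces to a couple of comparisons; a sketch (constants mirror the approximate durations in the table above):
```python
GATE_BACKOFF_S, INIT_RETRY_S, STEADY_S = 5, 15, 300

def next_sleep(gate_ok: bool, all_converged: bool) -> int:
    """Pick the sleep before the next wake cycle from the three tiers."""
    if not gate_ok:
        return GATE_BACKOFF_S      # services not yet responding
    if not all_converged:
        return INIT_RETRY_S        # at least one initialiser not at its flag
    return STEADY_S                # nothing to do
```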
### Failure Handling
An initialiser raising an exception does not stop the bootstrapper
or block other initialisers. Each initialiser in the cycle is
attempted independently; failures are logged and retried on the next
cycle. This means there is no ordered-DAG enforcement: order of
initialisers in the configuration determines the attempt order
within a cycle, but a dependency between two initialisers is
expressed by the dependant raising cleanly when its prerequisite
isn't satisfied. Over successive cycles the system converges.
### Flag Change and Re-run Safety
Each initialiser's completion state is a string flag chosen by the
operator. Typically these follow a simple version pattern
(`v1`, `v2`, ...), but the bootstrapper imposes no format.
Changing the flag in the group configuration causes the
corresponding initialiser to re-run on the next cycle. Initialisers
must be written so that re-running after a flag bump is safe — they
receive both the previous and the new flag and are responsible for
either cleanly re-applying the work or performing a step-change
migration from the prior state.
This gives operators an explicit, visible mechanism for triggering
re-initialisation. Re-runs are never implicit.
## Core Initialisers
The following initialisers ship in `trustgraph.bootstrap.initialisers`
and cover the base deployment case.
### PulsarTopology
Creates the Pulsar tenant and the four namespaces
(`flow`, `request`, `response`, `notify`) with appropriate
retention policies if they don't exist.
Opts out of the service gate (`wait_for_services = False`) because
config-svc and flow-svc cannot come online until the Pulsar
namespaces exist.
Parameters: Pulsar admin URL, tenant name.
Idempotent via the admin API (GET-then-PUT). Flag change causes
re-evaluation of all namespaces; any absent are created.
### TemplateSeed
Populates the reserved `__template__` workspace from an external
JSON seed file. The seed file has the standard shape of
`{config-type: {config-key: value}}`.
Runs post-gate. Parameters: path to the seed file, overwrite
policy (upsert-missing only, or overwrite-all).
On clean run, writes the whole file. On flag change, behaviour
depends on the overwrite policy — typically upsert-missing so
that operator-customised keys are preserved across seed-file
upgrades.
### WorkspaceInit
Creates a named workspace and populates it from the seed file or
from the full contents of the `__template__` workspace.
Runs post-gate. Parameters: workspace name, source (seed file or
`__template__`), optional `seed_file` path, `overwrite` flag.
When `source` is `template`, the initialiser copies every config
type and key present in `__template__` — there is no per-type
selection. Deployments that want to seed only a subset should
either curate the seed file they feed to `TemplateSeed` or use
`source: seed-file` directly here.
Raises cleanly if its source does not exist — depends on
`TemplateSeed` having run in the same cycle or a prior one.
### DefaultFlowStart
Starts a specific flow in a specific workspace using a specific
blueprint.
Runs post-gate. Parameters: workspace name, flow id, blueprint
name, description, optional parameter overrides.
Separated from `WorkspaceInit` deliberately so that deployments
which want a workspace without an auto-started flow can simply omit
this initialiser from their bootstrap configuration.
## Extensibility
New initialisers are added by:
1. Subclassing the initialiser base class.
2. Implementing `run(ctx, old_flag, new_flag)`.
3. Choosing `wait_for_services` (almost always `True`).
4. Adding an entry in the bootstrapper's configuration with the new
class's dotted path.
No core code changes are required to add an enterprise or third-party
initialiser. Enterprise builds ship their own package with their own
initialiser classes (e.g. `CreateAdminUser`, `ProvisionWorkspaces`)
and reference them in the bootstrapper config alongside the core
initialisers.
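A hypothetical bootstrapper configuration combining core and enterprise initialisers might look like the following. Field names and exact layout are illustrative; the spec fixes only what each entry must carry (dotted class path, unique instance name, flag string, constructor params).
```python
INITIALISERS = [
    {
        "class": "trustgraph.bootstrap.initialisers.PulsarTopology",
        "name": "pulsar-topology",
        "flag": "v1",
        "params": {"admin_url": "http://pulsar:8080", "tenant": "tg"},
    },
    {
        "class": "trustgraph.bootstrap.initialisers.TemplateSeed",
        "name": "template-seed",
        "flag": "v1",
        "params": {"seed_file": "/config/seed.json", "overwrite": "upsert-missing"},
    },
    {
        # Enterprise/third-party initialiser, loaded by dotted path; the
        # package name here is invented for the example.
        "class": "acme_enterprise.bootstrap.CreateAdminUser",
        "name": "create-admin",
        "flag": "v1",
        "params": {"username": "admin"},
    },
]
```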
## Reserved Workspaces
This specification relies on the "reserved workspace" convention:
- Any workspace id beginning with `_` is reserved.
- Reserved workspaces are stored normally by config-svc but never
appear in the config push broadcast.
- Live processors cannot react to reserved-workspace state.
The bootstrapper uses two reserved workspaces:
- `__template__` — factory-default seed config, readable by
initialisers that copy-from-template.
- `__system__` — bootstrapper completion state (under the
`init-state` config type) and any other system-internal bookkeeping.
See the reserved-workspace convention in the config service for
the general rule and its enforcement.
## Non-Goals
- No DAG scheduling across initialisers. Dependencies are expressed
by the dependant failing cleanly until its prerequisite is met,
and convergence over subsequent cycles.
- No parallel execution of initialisers within a cycle. A cycle runs
each initialiser sequentially.
- No implicit re-runs. Re-running an initialiser requires an explicit
flag change by the operator.
- No cross-initialiser atomicity. Each initialiser's completion is
recorded independently on its own success.
## Operational Notes
- Running the bootstrapper as a processor-group entry replaces the
previous `tg-init-trustgraph` container. The bootstrapper is also
CLI-invocable directly for standalone testing via
`Processor.launch(...)`.
- First-boot convergence is typically a handful of short cycles
followed by a transition to the steady cadence. Deployments
should expect the first few minutes of logs to show
initialisation activity, thereafter effective silence.
- Bumping a flag is a deliberate operational act. The log line
emitted on re-run makes the event visible for audit.


@@ -0,0 +1,273 @@
---
layout: default
title: "Capability Vocabulary Technical Specification"
parent: "Tech Specs"
---
# Capability Vocabulary Technical Specification
## Overview
Every gateway endpoint maps to exactly one *capability* — a string
from a closed vocabulary defined in this document. When the
gateway authorises a request, it hands the IAM regime four things:
the authenticated identity, the required capability, the
operation's resource (the structured identifier of what's being
operated on), and the operation's parameters. The IAM regime
decides allow or deny; see the [IAM contract](iam-contract.md) for
the full abstraction.
A capability is a **permission**, not a structural classification.
`graph:read` says "the caller may read graphs"; it does not say
where graphs live or how they are addressed. The shape of a
request — whether workspace appears in the URL, the envelope, or
the body, and whether it is a resource address component or an
operation parameter — is determined by what the operation operates
on, not by what permission it requires. Permission and structure
are orthogonal; the contract takes both.
This document defines:
- The **capability vocabulary** — the closed list of capability
strings the gateway uses as input to `authorise`. All IAM
regimes share this vocabulary; that's the only schema the
gateway and the IAM regime have to agree on.
- The **open-source role bundles** — the role-and-scope table the
OSS IAM regime uses to answer `authorise` calls. Other regimes
answer the same call differently; the bundles below are an
OSS-specific implementation detail, not a contract assertion.
A regime may evaluate `authorise` using role bundles (OSS), IdP
group memberships, attribute-based policies, relationship tuples,
or any other mechanism. The gateway is unaware of which. The
capability strings — and the resource component vocabulary the
gateway populates alongside them — are the only thing both sides
have to agree on.
## Motivation
The original IAM spec used hierarchical "minimum role" checks
(`admin` implies `writer` implies `reader`). That shape is simple
but paints the role model into a corner: any enterprise need to
grant a subset of admin abilities (helpdesk that can reset
passwords but not edit flows; analyst who can query but not ingest)
requires a protocol-level change.
A capability vocabulary decouples "what a request needs" from
"what roles a user has" and makes the role table pure data. The
open-source bundles can stay coarse while the enterprise role
table expands without any code movement.
## Design
### Capability string format
`<subsystem>:<verb>` or `<subsystem>` (for capabilities with no
natural read/write split). All lowercase, kebab-case for
multi-word subsystems.
### Capability list
**Data plane**
| Capability | Covers |
|---|---|
| `agent` | agent (query-only; no write counterpart) |
| `graph:read` | graph-rag, graph-embeddings-query, triples-query, sparql, graph-embeddings-export, triples-export |
| `graph:write` | triples-import, graph-embeddings-import |
| `documents:read` | document-rag, document-embeddings-query, document-embeddings-export, entity-contexts-export, document-stream-export, library list / fetch |
| `documents:write` | document-embeddings-import, entity-contexts-import, text-load, document-load, library add / replace / delete |
| `rows:read` | rows-query, row-embeddings-query, nlp-query, structured-query, structured-diag |
| `rows:write` | rows-import |
| `llm` | text-completion, prompt (stateless invocation) |
| `embeddings` | Raw text-embedding service (stateless compute; typed-data embedding stores live under their data-subject capability) |
| `mcp` | mcp-tool |
| `collections:read` | List / describe collections |
| `collections:write` | Create / delete collections |
| `knowledge:read` | List / get knowledge cores |
| `knowledge:write` | Create / delete knowledge cores |
**Control plane**
| Capability | Covers |
|---|---|
| `config:read` | Read workspace config |
| `config:write` | Write workspace config |
| `flows:read` | List / describe flows, blueprints, flow classes |
| `flows:write` | Start / stop / update flows |
| `users:read` | List / get users within the workspace |
| `users:write` | Create / update / disable users within the workspace |
| `users:admin` | Assign / remove roles on users within the workspace |
| `keys:self` | Create / revoke / list **own** API keys |
| `keys:admin` | Create / revoke / list **any user's** API keys within the workspace |
| `workspaces:admin` | Create / delete / disable workspaces (system-level) |
| `iam:admin` | JWT signing-key rotation, IAM-level operations |
| `metrics:read` | Prometheus metrics proxy |
### Open-source role bundles
The open-source edition ships three roles:
| Role | Capabilities |
|---|---|
| `reader` | `agent`, `graph:read`, `documents:read`, `rows:read`, `llm`, `embeddings`, `mcp`, `collections:read`, `knowledge:read`, `flows:read`, `config:read`, `keys:self` |
| `writer` | everything in `reader` **+** `graph:write`, `documents:write`, `rows:write`, `collections:write`, `knowledge:write` |
| `admin` | everything in `writer` **+** `config:write`, `flows:write`, `users:read`, `users:write`, `users:admin`, `keys:admin`, `workspaces:admin`, `iam:admin`, `metrics:read` |
Open-source bundles are deliberately coarse. `workspaces:admin` and
`iam:admin` live inside `admin` without a separate role; a single
`admin` user holds the keys to the whole deployment.
### The `agent` capability and composition
The `agent` capability is granted independently of the capabilities
it composes under the hood (`llm`, `graph`, `documents`, `rows`,
`mcp`, etc.). A user holding `agent` but not `llm` can still cause
LLM invocations because the agent implementation chooses which
services to invoke on the caller's behalf.
This is deliberate. A common policy is "allow controlled access
via the agent, deny raw model calls" — granting `agent` without
granting `llm` expresses exactly that. An administrator granting
`agent` should treat it as a grant of everything the agent
composes at deployment time.
### Authorisation evaluation (OSS regime)
This section describes how the OSS IAM regime answers
`authorise(identity, capability, resource, parameters)`. Other
regimes answer the same contract differently; only the inputs (the
capability vocabulary, the resource components, the parameter
shape) are shared.
For a request bearing a resolved set of roles
`R = {r1, r2, ...}`, a required capability `c`, a resource, and
parameters:
```
let target_workspace =
       resource.workspace       (workspace-/flow-level resources)
    or parameters.workspace     (system-level resources whose
                                 parameters reference a workspace)
    or unset                    (system-level operations with no
                                 workspace context)

allow if some role r in R has c in its capability bundle
      and (target_workspace is unset
           or r's workspace_scope permits target_workspace)
```
The OSS regime takes the workspace from whichever role it plays in
the operation:
- For workspace-level and flow-level resources, the workspace lives
in `resource.workspace` and that is what the role's scope is
checked against.
- For system-level resources whose operation parameters reference a
workspace (e.g. `create-user with workspace association W`),
workspace lives in `parameters.workspace` and that is what the
role's scope is checked against. The resource is system-level
(`resource = {}`) but the workspace constraint still bites.
- For system-level operations with no workspace context (e.g.
`bootstrap`, `rotate-signing-key`), the workspace-scope check
collapses — only capability-bundle membership matters.
No hierarchy, no precedence, no role-order sensitivity. A user
with a single role is the common case; a user with multiple roles
is allowed if any role independently grants both the capability
and the relevant workspace scope.
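Transcribed into Python, under an assumed role representation (a capability set plus a workspace-scope check), the evaluation is:
```python
from dataclasses import dataclass, field

@dataclass
class Role:
    capabilities: set[str]
    workspace_scope: set[str] = field(default_factory=set)  # workspaces this role may act in

    def permits_workspace(self, workspace: str) -> bool:
        return workspace in self.workspace_scope

def authorise(roles: list[Role], capability: str,
              resource: dict, parameters: dict) -> str:
    # Workspace can arrive as a resource address component or as a parameter.
    target_ws = resource.get("workspace") or parameters.get("workspace")
    for r in roles:
        if capability not in r.capabilities:
            continue
        if target_ws is None or r.permits_workspace(target_ws):
            return "allow"   # some role independently grants capability + scope
    return "deny"
```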
### Enforcement boundary
Capability checks — and authentication — are applied **only at the
API gateway**, on requests arriving from external callers.
Operations originating inside the platform (backend service to
backend service, agent to LLM, flow-svc to config-svc, bootstrap
initialisers, scheduled reconcilers, autonomous flow steps) are
**not capability-checked**. Backend services trust the workspace
set by the gateway on inbound pub/sub messages and trust
internally-originated messages without further authorisation.
This policy has four consequences that are part of the spec, not
accidents of implementation:
1. **The gateway is the single trust boundary for user
authorisation.** Every backend service is a downstream consumer
of an already-authorised workspace scope.
2. **Pub/sub carries workspace, not user identity.** Messages on
the bus do not carry credentials or the identity that originated
a request; they carry the resolved workspace only. This keeps
the bus protocol free of secrets and aligns with the workspace
resolver's role as the gateway-side narrowing step.
3. **Composition is transitive.** Granting a capability that the
platform composes internally (for example, `agent`) transitively
grants everything that capability composes under the hood,
because the downstream calls are internal-origin and are not
re-checked. The composite nature of `agent` described above is
a consequence of this policy, not a special case.
4. **Internal-origin operations have no user.** Bootstrap,
reconcilers, and other platform-initiated work act with
system-level authority. The workspace field on such messages
identifies which workspace's data is being touched, not who
asked.
**Trust model.** Whoever has pub/sub access is implicitly trusted
to act as any workspace. Defense-in-depth within the backend is
not part of this design; the security perimeter is the gateway
and the bus itself (TLS / network isolation between the bus and
any untrusted network).
### Unknown capabilities and unknown roles
- An endpoint declaring an unknown capability is a server-side bug
and fails closed (403, logged).
- A user carrying a role name that is not defined in the role table
is ignored for authorisation purposes and logged as a warning.
Behaviour is deterministic: unknown roles contribute zero
capabilities.
### Capability scope
Every capability is **implicitly scoped to the caller's resolved
workspace**. A `users:write` capability does not permit a user
in workspace `acme` to create users in workspace `beta` — the
workspace-resolver has already narrowed the request to one
workspace before the capability check runs. See the IAM
specification for the workspace-resolver contract.
The three exceptions are the system-level capabilities
`workspaces:admin` and `iam:admin`, which operate across
workspaces by definition, and `metrics:read`, which returns
process-level series not scoped to any workspace.
## Enterprise extensibility
Enterprise editions extend the role table additively:
```
data-analyst: {query, library:read, collections:read, knowledge:read}
helpdesk: {users:read, users:write, users:admin, keys:admin}
data-engineer: writer + {flows:read, config:read}
workspace-owner: admin - {workspaces:admin, iam:admin}
```
None of this requires a protocol change — the wire-protocol `roles`
field on user records is already a set, the gateway's
capability-check is already capability-based, and the capability
vocabulary is closed. Enterprises may introduce roles whose bundles
compose the same capabilities differently.
When an enterprise introduces a new capability (e.g. for a feature
that does not exist in open source), the capability string is
added to the vocabulary and recognised by the gateway build that
ships that feature.
## References
- [IAM Contract Specification](iam-contract.md) — the abstract
gateway↔IAM regime contract; capability strings are inputs to
`authorise`.
- [Identity and Access Management Specification](iam.md)
- [IAM Service Protocol Specification](iam-protocol.md) — the OSS
regime's wire-level protocol.
- [Architecture Principles](architecture-principles.md)


@@ -22,8 +22,16 @@ are the boundaries around data, and who owns what?
A workspace is the primary isolation boundary. It represents an
organisation, team, or independent operating unit. All data belongs to
exactly one workspace. Cross-workspace access is never permitted through
the API.
exactly one workspace.
Cross-workspace access through the API is gated by the IAM regime
(see [`iam-contract.md`](iam-contract.md)). In the OSS distribution,
the role table defined in [`capabilities.md`](capabilities.md)
permits cross-workspace operation only to the `admin` role; the
`reader` and `writer` roles are constrained to a single assigned
workspace per credential. Other regimes can model the relationship
between identity and workspace differently — the gateway makes no
assumption.
A workspace owns:
- Source documents
@@ -279,9 +287,18 @@ A typical workflow:
The current codebase uses a `user` field in message metadata and storage
partition keys to identify the workspace. The `collection` field
identifies the collection within that workspace. The IAM spec describes
how the gateway maps authenticated credentials to a workspace identity
and sets these fields.
identifies the collection within that workspace.
The gateway is the single point at which workspace gets stamped onto
outbound pub/sub messages. An incoming credential authenticates to a
workspace (the credential's binding, not a user-to-workspace lookup —
see [`iam-contract.md`](iam-contract.md) and the *Identity surface*
section of [`iam.md`](iam.md)); any caller-supplied workspace on the
request is reconciled against the authenticated identity by the IAM
regime; the resolved value is what the gateway writes into outgoing
messages and the storage layers' partition keys. Backend services
trust the workspace they receive — defense-in-depth happens at the
gateway, not at the bus.
For details on how each storage backend implements this scoping, see:
@@ -302,7 +319,10 @@ For details on how each storage backend implements this scoping, see:
## References
- [Identity and Access Management](iam.md)
- [IAM Contract](iam-contract.md) — gateway↔IAM regime abstraction.
- [Identity and Access Management](iam.md) — gateway-side framing.
- [Capability Vocabulary](capabilities.md) — capability strings and
the OSS role bundles that decide cross-workspace eligibility.
- [Collection Management](collection-management.md)
- [Entity-Centric Graph](entity-centric-graph.md)
- [Neo4j User Collection Isolation](neo4j-user-collection-isolation.md)


@@ -0,0 +1,403 @@
---
layout: default
title: "IAM Contract Technical Specification"
parent: "Tech Specs"
---
# IAM Contract Technical Specification
## Overview
The IAM contract is the abstraction between the API gateway and any
identity / access management regime that fronts it. The gateway
treats IAM as a black box behind two operations — *authenticate* and
*authorise* — plus a small surface of management operations. No
regime-specific concept (roles, scopes, groups, claims, policy
languages) is visible to the gateway, and no gateway-specific
concept (capability vocabulary, request anatomy) is visible to
backend services.
The TrustGraph open-source distribution ships one IAM regime — a
role-based implementation defined in
[`iam-protocol.md`](iam-protocol.md) — that is one implementation of
this contract. Enterprise editions can replace it with a different
regime (OIDC / SSO, ABAC, ReBAC, external policy engine) without
changing the gateway, the wire protocol, or the backends.
## Motivation
Authorisation models vary by deployment. A small team might be
happy with three predefined roles; an enterprise might need
group-mapping from an upstream IdP, attribute-based policies, or
relationship-based access control. Hard-wiring any one of those
into the gateway forces every other regime to either compromise its
model or be re-implemented.
A narrow contract — "authenticate this credential" and "may this
identity perform this operation on this resource" — captures what
the gateway actually needs to know without committing to a policy
shape. The IAM regime owns the policy decision; the gateway is a
generic enforcement point.
## Operations
### `authenticate`
```
authenticate(credential: bytes) → Identity | AuthFailure
```
Validates a credential the client presented. The gateway treats
the credential as opaque bytes — for the OSS regime today that's
either an API key plaintext or a JWT, but the gateway does not
parse them; the IAM regime decides.
On success, returns an `Identity`. On any failure the IAM regime
returns the same opaque `AuthFailure` — never a description of which
condition failed. This is the spec's masked-error rule: an
attacker probing the endpoint cannot distinguish "no such key",
"expired", "wrong signature", "revoked", "user disabled", etc.
### `authorise`
```
authorise(identity: Identity,
capability: str,
resource: Resource,
parameters: dict)
→ Decision
```
Asks whether the identity is permitted to perform the named
capability on the named resource, given the operation's
parameters. Returns `allow` or `deny`. `identity` is whatever
`authenticate` returned for this caller; the gateway never
decomposes it.
The four arguments separate concerns:
- **`identity`** — who is asking.
- **`capability`** — what permission they are exercising (e.g.
`users:write`, `graph:read`). Permission, not structure.
- **`resource`** — what is being operated on, as a structured
identifier. See *The Resource model* below.
- **`parameters`** — operation-specific data that the regime may
need to consider beyond the resource identifier. Used when a
decision depends on attributes the request supplies — e.g.
creating a user *with workspace association W*: the resource is
the system-level user registry, and W is a parameter the regime
checks against the caller's permissions for `users:write`.
Different regimes use the four arguments differently — one regime
might evaluate role bundles whose grants carry workspace scope;
another might consult upstream IdP group memberships; an ABAC
regime evaluates a policy with all four as inputs. The contract
is unchanged.
### `authorise_many`
```
authorise_many(identity: Identity,
checks: list[(str, Resource, dict)])
→ list[Decision]
```
Bulk variant of `authorise`. Same semantics, one round-trip for
many decisions. Used when an operation fans out to multiple
resources (e.g. an agent that touches several workspaces) and a
single permission check isn't sufficient.
`authorise_many` is not just a performance optimisation; it pins
the contract for fan-out operations early, before clients (or
internal callers) build patterns that assume
one-permission-check-per-request. Regimes implement it as a loop over `authorise`
unless they have a more efficient path.
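A regime with no bulk-efficient path can satisfy `authorise_many` with the trivial loop; a sketch (the `BaseRegime` class name is illustrative):
```python
class BaseRegime:
    async def authorise(self, identity, capability, resource, parameters):
        raise NotImplementedError

    async def authorise_many(self, identity, checks):
        # One authorise() per (capability, resource, parameters) tuple,
        # decisions returned in the same order as the checks.
        return [
            await self.authorise(identity, capability, resource, parameters)
            for capability, resource, parameters in checks
        ]
```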
### Management operations
Beyond the request-time `authenticate` / `authorise`, the contract
also covers identity-lifecycle and credential-lifecycle operations
that are invoked by administrative requests rather than by the
authentication path. These are regime-specific in detail (an SSO
regime that delegates user management to the IdP may not implement
most of them) but the operation set the gateway can forward is:
- User management: `create-user`, `list-users`, `get-user`,
`update-user`, `disable-user`, `enable-user`, `delete-user`
- Credential management: `create-api-key`, `list-api-keys`,
`revoke-api-key`, `change-password`, `reset-password`
- Workspace management: `create-workspace`, `list-workspaces`,
`get-workspace`, `update-workspace`, `disable-workspace`
- Session management: `login`, `whoami`
- Key management: `get-signing-key-public`, `rotate-signing-key`
- Bootstrap: `bootstrap`, `bootstrap-status`
`whoami` is the self-read counterpart to `get-user`: any
authenticated caller can read their own identity record without
holding a user-management capability. It is the gating-free probe
a UI uses to render affordances appropriate to the caller's role.
`bootstrap-status` is a side-effect-free probe of whether an
unconsumed `bootstrap` call would currently succeed. It exists so
a first-run UI can decide whether to render setup without invoking
the consuming `bootstrap` op. Public — no authentication.
A regime that does not support one of these (e.g. an SSO regime
where users are managed in the IdP) returns a defined "not
supported" error; the gateway surfaces it as a 501.
### Actor injection
For any management operation forwarded by the gateway after
authentication, the gateway injects the authenticated caller's
`handle` as an `actor` field on the request. Regimes use `actor`
to identify *who is making the request* — distinct from the
operation's target (which lives in `user_id` / `key_id` /
`workspace_record` / etc.) — for purposes such as:
- Self-service operations (`whoami`, `change-password`) that
resolve "the caller" without taking a target argument.
- Audit logging, where the actor is recorded against the change.
- Decisions that depend on the resolved resource state. The
gateway authorises against the parameters on the request, but it
cannot know the resolved resource's actual properties (e.g. the
workspace association of a target user) before the regime loads
it. When that matters, the regime can re-decide using the
actor's permissions and the resolved record — closing a class
of cases the gateway-side check can't see.
Caller-supplied `actor` values on the request body are overwritten
by the gateway — the gateway is the only authority for actor
identity, and a regime that consults `actor` can rely on it being
authentic.
## The `Identity` surface
`Identity` is *mostly* opaque. The gateway holds the value as a
token to quote back when calling `authorise`, never decomposing it.
But there are a few gateway-side concerns that need a small
surface:
| Field | Purpose |
|---|---|
| `handle` | Opaque reference passed back to `authorise`. Regime-defined; gateway treats as a string. |
| `workspace` | The workspace this credential authenticates to. Used by the gateway only as a default-fill-in for operations that omit a workspace. Never used as policy input — when authorisation needs to know which workspace the operation acts on, the operation places it in the resource address (or a parameter), and the regime decides. |
| `principal_id` | Stable identifier the gateway logs for audit (a user id, a sub claim, a service account id). Never used for authorisation — that's `authorise`'s job. |
| `source` | How the credential was presented (`api-key`, `jwt`, …). Non-policy; useful for logs and metrics only. |
Anything else — roles, claims, group memberships, policy attributes
— stays inside the regime and is reachable only via `authorise`.
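A sketch of the gateway-side view of that surface (the dataclass itself is illustrative; only the four fields and their roles come from the table above):
```python
from dataclasses import dataclass

@dataclass(frozen=True)
class Identity:
    handle: str        # opaque; quoted back to authorise(), never decomposed
    workspace: str     # default-fill-in for requests that omit a workspace; not policy input
    principal_id: str  # stable identifier, audit logging only
    source: str        # "api-key", "jwt", ...; logs and metrics only
```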
## The `Resource` model
A `Resource` is a structured value identifying *what is being
operated on*. Resources live at one of three levels in TrustGraph,
based on where the resource exists in the deployment:
### Resource levels
| Level | What lives there | Resource shape |
|---|---|---|
| **System** | The user registry, the workspace registry, the signing key, the audit log — anything that exists once per deployment. | `{}` |
| **Workspace** | A workspace's config, flow definitions, library (documents), knowledge cores, collections — things that exist *within* a workspace. | `{workspace: "..."}` |
| **Flow** | A flow's knowledge graph, agent state, LLM context, embedding state, MCP context — things that exist *within* a flow within a workspace. | `{workspace: "...", flow: "..."}` |
Note carefully:
- **Users are a system-level resource.** A user record exists at
the deployment level; the fact that a user has a *workspace
association* (one in OSS, possibly many in other regimes) is a
property of the user record, not a containment. Operations on
the user registry have `resource = {}`; the workspace
association appears as a *parameter*, not as a resource address
component.
- **Workspaces themselves are a system-level resource.** The
workspace registry exists at the deployment level. `create-workspace`
and `list-workspaces` are system-level operations;
the workspace identifier in their bodies is a parameter, not an
address.
- **A workspace's contents are workspace-level resources.** A
workspace's config, flows, library, etc. live within a
workspace. Their resource address is `{workspace: ...}`.
- **A flow's contents are flow-level resources.** Knowledge
graphs, agents, etc. live within a flow. Their resource
address is `{workspace: ..., flow: ...}`.
### Component vocabulary
| Component | Type | Meaning | Used by |
|---|---|---|---|
| `workspace` | string | Identifier of the workspace whose contents are being operated on | workspace-level and flow-level resource addresses |
| `flow` | string | Identifier of a flow within a workspace; always paired with `workspace` | flow-level resource addresses |
| `collection` | string | Reserved for finer-grained scoping within a workspace | future / enterprise |
| `document` | string | Reserved for per-document scoping | future / enterprise |
A `Resource` is a partial mapping of these components to values.
The level of the resource (system / workspace / flow) determines
which components must be present. An empty `{}` is the
system-level resource.
### Workspace as parameter vs. address
Workspace plays two distinct roles in operations and shows up in
two distinct places:
- **As a resource address component** — workspace identifies the
thing being operated on. Lives in `resource.workspace`. Example:
`config:read` reads the config *of* workspace W.
- **As an operation parameter** — workspace is data the operation
acts on or filters by, while the resource itself is system-level.
Lives in `parameters.workspace`. Example: `users:write`
creates a user *with workspace association* W; the resource is
the user registry (system), and W is a parameter.
These are not interchangeable. The IAM regime considers each role
separately; the OSS role table, for instance, applies workspace
scope to the address component when checking workspace-level
operations, and to a parameter when checking
"create-user-with-workspace-W". Both end up enforcing the admin's
scope, but through different code paths.
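As a concrete illustration, these are the argument bundles the gateway would hand to `authorise` in the two cases (the workspace id `acme` is invented for the example):
```python
# Workspace as a resource address component: read the config of workspace "acme".
check_read_config = {
    "capability": "config:read",
    "resource":   {"workspace": "acme"},
    "parameters": {},
}

# Workspace as an operation parameter: create a user with workspace association "acme".
check_create_user = {
    "capability": "users:write",
    "resource":   {},                        # system-level user registry
    "parameters": {"workspace": "acme"},
}
```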
### Extension rules
The vocabulary is closed but extensible. Adding a new component:
1. The component is added to the vocabulary in this spec, with a
defined name, type, and meaning.
2. Existing IAM regimes ignore unknown components (forward
compatibility — adding a new component does not break older
regimes that don't understand it).
3. Older gateways that don't populate a new component leave it
unset; regimes that need it for a decision treat "unset" as
"absent" and decide accordingly (typically: cannot grant
permission scoped to a component the gateway didn't supply).
A regime that wants stricter behaviour (e.g. fail-closed on
unknown components rather than ignoring them) declares so as part
of its own configuration; the contract default is "ignore unknown".
## Operation registry (gateway-side)
Mapping a request onto `(capability, resource, parameters)` is
service-specific — it cannot be inferred from the capability
alone. The gateway maintains an **operation registry** that
declares, per operation:
- The required capability.
- The resource level (system / workspace / flow) — determines the
shape of the resource identifier.
- How to extract the resource address components (workspace,
flow) from the request — from URL path, WebSocket envelope, or
body.
- Which body fields are operation parameters (and which of those
the IAM regime should see in the `parameters` argument).
This registry is part of the gateway's endpoint declarations, not
part of the IAM contract. The contract specifies what arguments
`authorise` receives; how the gateway populates them is its own
concern.
In the OSS gateway, registry keys follow these conventions:
| Pattern | Used by | Resource level |
|---|---|---|
| bare op name (`create-user`, `list-users`, `login`, …) | `/api/v1/iam` and the auth surface | system / workspace, per op |
| `<kind>:<op>` (`config:get`, `flow:list-blueprints`, `librarian:add-document`, …) | `/api/v1/{kind}` (workspace-scoped global services) | workspace |
| `flow-service:<kind>` (`flow-service:agent`, `flow-service:graph-rag`, …) | `/api/v1/flow/{flow}/service/{kind}` and the WS Mux | flow |
| `flow-import:<kind>` / `flow-export:<kind>` | `/api/v1/flow/{flow}/{import,export}/{kind}` streaming sockets | flow |
Keys are an OSS-gateway implementation detail — the contract does
not constrain naming. The conventions above exist so the registry
key is uniquely derivable from the request path and (where
applicable) body without ambiguity.
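A hypothetical shape for two registry entries, showing what each declaration has to capture (the field names and extraction hints are invented; the contract does not constrain how a gateway stores this):
```python
OPERATIONS = {
    "flow-service:graph-rag": {
        "capability": "graph:read",
        "level": "flow",                                     # resource = {workspace, flow}
        "resource_from": {"workspace": "envelope", "flow": "url-path"},
        "parameters": ["collection"],                        # body fields shown to authorise()
    },
    "create-user": {
        "capability": "users:write",
        "level": "system",                                   # resource = {}
        "resource_from": {},
        "parameters": ["workspace"],                         # workspace association is a parameter
    },
}
```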
## Caching
Both `authenticate` and `authorise` results are cached at the
gateway, on different policies:
- **`authenticate`** — cached by a hash of the credential. The OSS
gateway uses a fixed short TTL (currently 60 s) so that revoked
API keys and disabled users stop working within the TTL window
without any push mechanism. Regimes that want a different
behaviour can return an `expires` hint with the identity; the
gateway honours the smaller of `expires` and its own ceiling.
- **`authorise`** — cached by a hash of `(handle, capability,
resource, parameters)`. The regime returns a suggested TTL with
the decision; the gateway clamps it above by a deployment-set
ceiling (currently 60 s). Both allow and deny decisions are
cached; denies briefly, to avoid hammering the regime with
repeated rejected attempts.
The TTL ceiling caps the revocation latency window — a role
revoked at the regime takes effect at the gateway no later than
the ceiling. Operators that need stricter revocation can lower
the ceiling.
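A sketch of the decision-cache clamping rule (the cache structure is illustrative; the 60-second ceiling is the value quoted above):
```python
import time

TTL_CEILING_S = 60   # deployment-set ceiling; caps revocation latency

class DecisionCache:
    def __init__(self):
        self._entries = {}   # key: hash of (handle, capability, resource, parameters)

    def put(self, key, decision, suggested_ttl_s):
        ttl = min(suggested_ttl_s, TTL_CEILING_S)   # gateway clamps the regime's hint
        self._entries[key] = (decision, time.monotonic() + ttl)

    def get(self, key):
        entry = self._entries.get(key)
        if entry and entry[1] > time.monotonic():
            return entry[0]
        return None          # expired or absent; re-ask the regime
```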
## Failure modes
| Condition | Behaviour |
|---|---|
| `authenticate` returns AuthFailure | Gateway responds 401 with the masked `auth failure` body. |
| `authorise` returns deny | Gateway responds 403 with the masked `access denied` body. |
| IAM regime unreachable | Gateway responds 401 / 503 (deployment-defined). No fail-open. |
| `authorise_many` partial deny | Gateway treats the request as denied; the operation is rejected. Partial-success semantics are not part of the contract. |
| Regime returns "not supported" for a management operation | Gateway responds 501. |
There is no fallback or "soft" decision path. An IAM regime that
is unavailable, slow, or returning errors causes requests to fail
closed.
## Implementations
### Open-source role-based regime
Defined in [`iam-protocol.md`](iam-protocol.md). Implements the
contract via:
- A pub/sub request/response service (`iam-svc`) reached only by
the gateway over the message bus.
- Credentials are API keys (opaque) or JWTs (Ed25519, locally
validated by the gateway against the regime's published public
key).
- `authorise` reduces to a lookup against the role bundles in
[`capabilities.md`](capabilities.md), with each grant's workspace
scope checked against the operation's workspace component.
- Identity, user, and workspace records live in Cassandra.
The OSS regime is deliberately simple — three roles, a single
workspace association per user (a regime data-model decision, not
a contract assertion), no policy language. Other regimes can
grant the same user different permissions in different workspaces
without changing anything outside the regime.
### Future regimes
The contract is shaped to admit, without code change in the
gateway:
- **OIDC / SSO** — `authenticate` validates an OIDC ID token via
the IdP's JWKS; `Identity.handle` carries the verified subject
and group claims; `authorise` evaluates against group-to-capability
mappings configured at the regime.
- **ABAC / Policy engine** — `authorise` calls out to a policy
engine (Rego, Cedar, custom DSL) with the identity's attributes
and the resource as the policy input.
- **ReBAC (Zanzibar-style)** — `authorise` translates `(identity,
capability, resource)` into a relationship-tuple lookup against
a tuple store.
- **Hybrid** — multiple regimes composed: e.g. authenticate via
SSO, authorise via local policy.
None of these require gateway changes. The contract surface is
the same; the regime is what differs.
## References
- [Identity and Access Management Specification](iam.md) — overall
design and the gateway-side framing.
- [IAM Service Protocol Specification](iam-protocol.md) — the OSS
regime's wire-level protocol.
- [Capability Vocabulary Specification](capabilities.md) — the
capability strings the gateway uses as `authorise` input.


@@ -0,0 +1,386 @@
---
layout: default
title: "IAM Service Protocol Technical Specification"
parent: "Tech Specs"
---
# IAM Service Protocol Technical Specification
## Overview
This document specifies the wire protocol of the **open-source IAM
regime** — one implementation of the abstract IAM contract defined
in [`iam-contract.md`](iam-contract.md). Other regimes (OIDC / SSO,
ABAC, ReBAC, external policy engines) implement the same contract
with different transports, data models, and policy semantics; the
gateway is unaware of which regime it's wired against.
The OSS regime is a backend processor (`iam-svc`) reached over the
standard request/response pub/sub pattern. It owns users,
workspaces, API keys, login credentials, and JWT signing keys, all
backed by Cassandra. The API gateway is its only caller.
This document defines:
- the `IamRequest` and `IamResponse` dataclasses on the bus,
- the operation set the OSS regime implements,
- per-operation input and output fields,
- the error taxonomy,
- the bootstrap modes,
- the initial HTTP forwarding endpoint used while the protocol is
being exercised.
The mapping from this regime onto the abstract contract is direct:
| Contract operation | OSS regime operation |
|---|---|
| `authenticate(credential)` | `resolve-api-key` (for API keys); local JWT validation against `get-signing-key-public` (for JWTs) |
| `authorise(identity, capability, resource, parameters)` | Role-table lookup against the OSS role bundles defined in [`capabilities.md`](capabilities.md), gated by workspace scope. Workspace can come from the resource address (workspace- and flow-level resources) or from a parameter (system-level resources whose parameters reference a workspace, e.g. `create-user with workspace association W`). |
| `authorise_many` | Loop over `authorise` |
| Identity / credential / workspace management | `create-user`, `create-api-key`, etc. as listed below. These are operations on system-level resources (the user / workspace / credential registries); workspace, where it appears in the body, is a parameter. |
Architectural context — roles, capabilities, workspace as resource
scope, enforcement boundary — lives in [`iam.md`](iam.md) and
[`capabilities.md`](capabilities.md). The contract abstraction
lives in [`iam-contract.md`](iam-contract.md).
## Transport
- **Request topic:** `request:tg/request/iam-request`
- **Response topic:** `response:tg/response/iam-response`
- **Pattern:** request/response, correlated by the `id` message
property, the same pattern used by `config-svc` and `flow-svc`.
- **Caller:** the API gateway only. Under the enforcement-boundary
policy (see capabilities spec), the IAM service trusts the bus
and performs no per-request authentication or capability check
against the caller. The gateway has already evaluated capability
membership and workspace scoping before sending the request.
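The correlation mechanics follow the usual request/response
pattern: the caller attaches a fresh `id` message property,
publishes to the request topic, and waits for the response
carrying the same `id`. A rough sketch of that pattern is below;
the bus client (`publish` / `consume`), the message property
accessors, and the payload attribute are hypothetical stand-ins
for the real pub/sub library.
```python
import asyncio
import uuid

class IamRequestor:
    """Sends IamRequest messages and awaits the matching IamResponse,
    correlated by the `id` message property."""

    def __init__(self, bus):
        self.bus = bus          # hypothetical pub/sub client
        self._pending = {}      # request id -> asyncio.Future

    async def run(self):
        # Run as a background task: one consumer demuxes every response
        # back to the caller that issued the matching request.
        async for msg in self.bus.consume("response:tg/response/iam-response"):
            fut = self._pending.pop(msg.properties.get("id"), None)
            if fut is not None and not fut.done():
                fut.set_result(msg.value)   # the IamResponse payload

    async def request(self, iam_request, timeout=30):
        req_id = str(uuid.uuid4())
        fut = asyncio.get_running_loop().create_future()
        self._pending[req_id] = fut
        await self.bus.publish(
            "request:tg/request/iam-request",
            iam_request,
            properties={"id": req_id},
        )
        try:
            return await asyncio.wait_for(fut, timeout)
        finally:
            self._pending.pop(req_id, None)
```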
## Dataclasses
### `IamRequest`
```python
@dataclass
class IamRequest:
# One of the operation strings below.
operation: str = ""
# Scope of this request. Required on every workspace-scoped
# operation. Omitted (or empty) for system-level ops
# (workspace CRUD, signing-key ops, bootstrap, resolve-api-key,
# login).
workspace: str = ""
# Acting user id. Set by the gateway to the authenticated
# caller's identity handle for every authenticated request
# (overwrites any caller-supplied value — the gateway is the
# only authority for actor identity, so handlers can rely on it
# being authentic). Used for audit logging, self-service ops
# like ``whoami`` that resolve "the caller", and future actor-
# scoped policy checks. Empty for unauthenticated ops
# (``login``, ``bootstrap``, ``bootstrap-status``,
# ``get-signing-key-public``, ``resolve-api-key``). See the
# actor-injection rule in the IAM contract spec.
actor: str = ""
# --- identity selectors ---
user_id: str = ""
username: str = "" # login; unique within a workspace
key_id: str = "" # revoke-api-key, list-api-keys (own)
api_key: str = "" # resolve-api-key (plaintext)
# --- credentials ---
password: str = "" # login, change-password (current)
new_password: str = "" # change-password
# --- user fields ---
user: UserInput | None = None # create-user, update-user
# --- workspace fields ---
workspace_record: WorkspaceInput | None = None # create-workspace, update-workspace
# --- api key fields ---
key: ApiKeyInput | None = None # create-api-key
```
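On the wire the request is just this dataclass serialised.
Assuming a plain `dataclasses.asdict`-style encoding, a
gateway-issued `list-users` call might look like the following;
the actor value is a placeholder and the real serialisation layer
may differ.
```python
from dataclasses import asdict

# The gateway sets `actor` from the authenticated identity, overwriting
# any caller-supplied value (actor-injection rule).
req = IamRequest(
    operation="list-users",
    workspace="default",            # optional filter for this operation
    actor="<admin-user-id>",        # placeholder for the acting admin's id
)

payload = asdict(req)
# {'operation': 'list-users', 'workspace': 'default',
#  'actor': '<admin-user-id>', 'user_id': '', 'username': '', ...,
#  'user': None, 'workspace_record': None, 'key': None}
```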
### `IamResponse`
```python
@dataclass
class IamResponse:
# Populated on success of operations that return them.
user: UserRecord | None = None # create-user, get-user, update-user
users: list[UserRecord] = field(default_factory=list) # list-users
workspace: WorkspaceRecord | None = None # create-workspace, get-workspace, update-workspace
workspaces: list[WorkspaceRecord] = field(default_factory=list) # list-workspaces
# create-api-key returns the plaintext once. Never populated
# on any other operation.
api_key_plaintext: str = ""
api_key: ApiKeyRecord | None = None # create-api-key
api_keys: list[ApiKeyRecord] = field(default_factory=list) # list-api-keys
# login, rotate-signing-key
jwt: str = ""
jwt_expires: str = "" # ISO-8601 UTC
# get-signing-key-public
signing_key_public: str = "" # PEM
# resolve-api-key returns who this key authenticates as.
resolved_user_id: str = ""
resolved_workspace: str = ""
resolved_roles: list[str] = field(default_factory=list)
# reset-password
temporary_password: str = "" # returned once to the operator
# bootstrap: on first run, the initial admin's one-time API key
# is returned for the operator to capture.
bootstrap_admin_user_id: str = ""
bootstrap_admin_api_key: str = ""
# bootstrap-status: true iff an unconsumed ``bootstrap`` call
# would currently succeed. Always emitted by the response
# translator (the false case is meaningful for first-run UIs).
bootstrap_available: bool = False
# Present on any failed operation.
error: Error | None = None
```
### Value types
```python
from dataclasses import dataclass, field

@dataclass
class UserInput:
username: str = ""
name: str = ""
email: str = ""
password: str = "" # only on create-user; never on update-user
roles: list[str] = field(default_factory=list)
enabled: bool = True
must_change_password: bool = False
@dataclass
class UserRecord:
id: str = ""
workspace: str = ""
username: str = ""
name: str = ""
email: str = ""
roles: list[str] = field(default_factory=list)
enabled: bool = True
must_change_password: bool = False
created: str = "" # ISO-8601 UTC
# Password hash is never included in any response.
@dataclass
class WorkspaceInput:
id: str = ""
name: str = ""
enabled: bool = True
@dataclass
class WorkspaceRecord:
id: str = ""
name: str = ""
enabled: bool = True
created: str = "" # ISO-8601 UTC
@dataclass
class ApiKeyInput:
user_id: str = ""
name: str = "" # operator-facing label, e.g. "laptop"
expires: str = "" # optional ISO-8601 UTC; empty = no expiry
@dataclass
class ApiKeyRecord:
id: str = ""
user_id: str = ""
name: str = ""
prefix: str = "" # first 4 chars of plaintext, for identification in lists
expires: str = "" # empty = no expiry
created: str = ""
last_used: str = "" # empty if never used
# key_hash is never included in any response.
```
## Operations
| Operation | Request fields | Response fields | Notes |
|---|---|---|---|
| `login` | `username`, `password`, `workspace` (optional) | `jwt`, `jwt_expires` | If `workspace` omitted, IAM resolves to the user's assigned workspace. |
| `whoami` | `actor` (gateway-injected) | `user` | Returns the calling user's own record. AUTHENTICATED-only; no `users:read` capability required. |
| `resolve-api-key` | `api_key` (plaintext) | `resolved_user_id`, `resolved_workspace`, `resolved_roles` | Gateway-internal. Service returns `auth-failed` for unknown / expired / revoked keys. |
| `change-password` | `user_id`, `password` (current), `new_password` | — | Self-service. IAM validates `password` against stored hash. |
| `reset-password` | `user_id`, `workspace` (optional integrity check) | `temporary_password` | Admin-initiated. IAM generates a random password, sets `must_change_password=true` on the user, returns the plaintext once. |
| `create-user` | `workspace`, `user` | `user` | `user.password` is hashed and stored; `user.roles` must be subset of known roles. `workspace` is the new user's home-workspace binding (a required *parameter*, not an address). |
| `list-users` | `workspace` (optional filter) | `users` | If `workspace` omitted, returns the deployment-wide list. |
| `get-user` | `user_id`, `workspace` (optional integrity check) | `user` | |
| `update-user` | `user_id`, `user`, `workspace` (optional integrity check) | `user` | `password` field on `user` is rejected; use `change-password` / `reset-password`. Username is immutable. |
| `disable-user` | `user_id`, `workspace` (optional integrity check) | — | Soft-delete; sets `enabled=false`. Revokes all the user's API keys. |
| `enable-user` | `user_id`, `workspace` (optional integrity check) | — | Re-enables a previously disabled user; does not restore API keys. |
| `delete-user` | `user_id`, `workspace` (optional integrity check) | — | Hard-delete; removes user record, username lookup, and all the user's API keys. |
| `create-workspace` | `workspace_record` | `workspace` | System-level. |
| `list-workspaces` | — | `workspaces` | System-level. |
| `get-workspace` | `workspace_record` (id only) | `workspace` | System-level. |
| `update-workspace` | `workspace_record` | `workspace` | System-level. |
| `disable-workspace` | `workspace_record` (id only) | — | System-level. Sets `enabled=false`; revokes all workspace API keys; disables all users in the workspace. |
| `create-api-key` | `key`, `workspace` (optional integrity check) | `api_key_plaintext`, `api_key` | Plaintext returned **once**; only hash stored. `key.name` required. |
| `list-api-keys` | `user_id`, `workspace` (optional integrity check) | `api_keys` | |
| `revoke-api-key` | `key_id`, `workspace` (optional integrity check) | — | Deletes the key record. |
| `get-signing-key-public` | — | `signing_key_public` | Gateway fetches this at startup. |
| `rotate-signing-key` | — | — | System-level. Introduces a new signing key; old key continues to validate JWTs for a grace period (implementation-defined, minimum 1h). |
| `bootstrap` | — | `bootstrap_admin_user_id`, `bootstrap_admin_api_key` | If IAM tables are empty and the service is in `bootstrap` mode, creates the initial `default` workspace, an `admin` user, an initial API key, and an initial signing key; returns them once. Otherwise returns a masked auth failure. |
| `bootstrap-status` | — | `bootstrap_available` | Side-effect-free probe; `true` iff iam-svc is in `bootstrap` mode and tables are empty. Intended for first-run UX. |
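To make the table concrete, a `create-user` round trip might look
like the following. Values are illustrative and error handling is
omitted; the comments describe the response fields the table says
come back.
```python
# Request: create a writer user in the "default" workspace.
req = IamRequest(
    operation="create-user",
    workspace="default",              # the new user's home-workspace binding (parameter)
    actor="<admin-user-id>",          # injected by the gateway; placeholder here
    user=UserInput(
        username="alice",
        name="Alice",
        email="alice@example.com",
        password="s3cret",            # hashed with a slow KDF; never echoed back
        roles=["writer"],
    ),
)

# On success the response carries the created record and no error:
#   resp.user  -> UserRecord(id="<uuid>", workspace="default",
#                            username="alice", roles=["writer"],
#                            enabled=True, ...)
#   resp.error -> None
# On a username collision the response instead carries
#   resp.error.type == "duplicate".
```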
## Error taxonomy
All errors are carried in the `IamResponse.error` field. `error.type`
is one of the values below; `error.message` is a human-readable
string that is **not** surfaced verbatim to external callers (the
gateway maps to `auth failure` / `access denied` per the IAM error
policy).
| `type` | When |
|---|---|
| `invalid-argument` | Malformed request (missing required field, unknown operation, invalid format). |
| `not-found` | Named resource does not exist (`user_id`, `key_id`, workspace). |
| `duplicate` | Create operation collides with an existing resource (username, workspace id, key name). |
| `auth-failed` | `login` with wrong credentials; `resolve-api-key` with unknown / expired / revoked key; `change-password` with wrong current password. A single bucket, so error responses cannot be used as an oracle for which condition was hit. |
| `weak-password` | Password does not meet policy (length, complexity — policy defined at service level). |
| `disabled` | Target user or workspace has `enabled=false`. |
| `operation-not-permitted` | Non-admin attempting system-level operation, or workspace-scoped operation attempting to affect another workspace. |
| `internal-error` | Unexpected IAM-side failure. Log and surface as 500 at the gateway. |
The gateway is responsible for translating `auth-failed` and
`operation-not-permitted` into the obfuscated external error
response (`"auth failure"` / `"access denied"`); `invalid-argument`
becomes a descriptive 400; `not-found` / `duplicate` /
`weak-password` / `disabled` become descriptive 4xx but never leak
IAM-internal detail.
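A sketch of that gateway-side translation. The masked categories
follow the table exactly; the concrete 4xx codes for the
descriptive categories are assumptions (the spec only requires a
descriptive 4xx), and the function name is illustrative.
```python
# Masked categories: no IAM-internal detail crosses the boundary.
MASKED = {
    "auth-failed": (401, "auth failure"),
    "operation-not-permitted": (403, "access denied"),
}

# Descriptive categories: a useful message, but still a gateway-owned one.
# The specific status codes here are assumptions.
DESCRIPTIVE = {
    "invalid-argument": 400,
    "not-found": 404,
    "duplicate": 409,
    "weak-password": 400,
    "disabled": 403,
}

def to_external_error(error_type: str, safe_message: str):
    """Map IamResponse.error.type to an external (status, body) pair."""
    if error_type in MASKED:
        status, text = MASKED[error_type]
        return status, {"error": text}
    if error_type in DESCRIPTIVE:
        return DESCRIPTIVE[error_type], {"error": safe_message}
    # internal-error and anything unrecognised: log the detail server-side,
    # surface a plain 500 externally.
    return 500, {"error": "internal error"}
```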
## Credential storage
- **Passwords** are stored using a slow KDF (bcrypt / argon2id — the
service picks; documented as an implementation detail). The
`password_hash` column stores the full KDF-encoded string
(algorithm, cost, salt, hash). Not a plain SHA-256.
- **API keys** are stored as SHA-256 of the plaintext. API keys
are 128-bit random values (`tg_` + base64url); the entropy
makes a slow hash unnecessary. The hash serves as the primary
key on the `iam_api_keys` table, enabling O(1) lookup on
`resolve-api-key` (see the sketch after this list).
- **JWT signing key** is stored as an RSA or Ed25519 private key
(implementation choice) in a dedicated `iam_signing_keys` table
with a `kid`, `created`, and optional `retired` timestamp. At
most one active key; up to N retired keys are kept for a grace
period to validate previously-issued JWTs.
Passwords, API-key plaintext, and signing-key private material are
never returned in any response other than the explicit one-time
responses above (`reset-password`, `create-api-key`, `bootstrap`).
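A sketch of the API-key format and hash-keyed lookup described
above. The `tg_` prefix, 128-bit entropy, base64url encoding, and
SHA-256 storage come from the text; the helper names and the dict
standing in for the `iam_api_keys` table are illustrative.
```python
import hashlib
import secrets

def generate_api_key() -> str:
    # 128 bits of randomness, base64url-encoded, "tg_" prefix.
    return "tg_" + secrets.token_urlsafe(16)

def api_key_hash(plaintext: str) -> str:
    # High-entropy keys don't need a slow KDF; SHA-256 of the plaintext
    # doubles as the primary key, giving O(1) resolve-api-key lookups.
    return hashlib.sha256(plaintext.encode("ascii")).hexdigest()

# Issue time: store only the hash plus a short display prefix.
plaintext = generate_api_key()
api_keys = {                         # stand-in for the iam_api_keys table
    api_key_hash(plaintext): {
        "prefix": plaintext[:4],     # first 4 chars, per ApiKeyRecord.prefix
        "user_id": "<uuid>",
    },
}

# Resolve time: hash the presented key and look the record up directly.
presented = plaintext
record = api_keys.get(api_key_hash(presented))   # None => auth-failed
```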
## Bootstrap modes
`iam-svc` requires a bootstrap mode to be chosen at startup. There is
no default — an unset or invalid mode causes the service to refuse
to start. The purpose is to force the operator to make an explicit
security decision rather than rely on an implicit "safe" fallback.
| Mode | Startup behaviour | `bootstrap` operation | Suitability |
|---|---|---|---|
| `token` | On first start with empty tables, auto-seeds the `default` workspace, admin user, admin API key (using the operator-provided `--bootstrap-token`), and an initial signing key. No-op on subsequent starts. | Refused — returns `auth-failed` / `"auth failure"` regardless of caller. | Production, any public-exposure deployment. |
| `bootstrap` | No startup seeding. Tables remain empty until the `bootstrap` operation is invoked over the pub/sub bus (typically via `tg-bootstrap-iam`). | Live while tables are empty. Generates and returns the admin API key once. Refused (`auth-failed`) once tables are populated. | Dev / compose up / CI. **Not safe under public exposure** — any caller who reaches the gateway's `/api/v1/iam` forwarder before the operator does can have the admin token issued to them. Operators choosing this mode accept that risk. |
### Error masking
In both modes, any refused invocation of the `bootstrap` operation
returns the same error (`auth-failed` / `"auth failure"`). A caller
cannot distinguish:
- "service is in token mode"
- "service is in bootstrap mode but already bootstrapped"
- "operation forbidden"
This matches the general IAM error-policy stance (see `iam.md`) and
prevents externally enumerating IAM's state.
### Configuration sources
The mode and token can be supplied two ways. Resolution order is
fixed; there is no permissive fallback.
| Source | Field |
|---|---|
| Processor-group YAML / CLI argument | `bootstrap_mode`, `bootstrap_token` |
| Environment variable | `IAM_BOOTSTRAP_MODE`, `IAM_BOOTSTRAP_TOKEN` |
For each setting the service uses the explicit param value if
present; otherwise the environment variable; otherwise the service
refuses to start. The env-var path is intended for the K8s
deployment pattern where the token is injected from a `Secret` via
`secretKeyRef`, so the plaintext never has to live in YAML or git.
A typical production manifest holds `bootstrap_mode: "token"` in
the YAML and pulls `IAM_BOOTSTRAP_TOKEN` from the Secret; the YAML
is then safe to version-control.
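A sketch of that resolution order: explicit parameter first, then
the environment variable, otherwise refuse to start. The function
name and the way the explicit value is obtained are illustrative.
```python
import os
import sys

def resolve_bootstrap_setting(explicit_value: str, env_var: str) -> str:
    """Explicit param (YAML / CLI) wins; otherwise the env var; otherwise refuse to start."""
    if explicit_value:
        return explicit_value
    value = os.environ.get(env_var, "")
    if value:
        return value
    sys.exit(f"refusing to start: {env_var} unset and no explicit value supplied")

# At startup; an empty string here means "not supplied in the YAML / CLI".
mode = resolve_bootstrap_setting("", "IAM_BOOTSTRAP_MODE")
if mode not in ("token", "bootstrap"):
    sys.exit(f"refusing to start: invalid bootstrap mode {mode!r}")
token = (
    resolve_bootstrap_setting("", "IAM_BOOTSTRAP_TOKEN")
    if mode == "token"
    else ""
)
```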
### Bootstrap-token lifecycle
The bootstrap token — whether operator-supplied (`token` mode) or
service-generated (`bootstrap` mode) — is a one-time credential. It
is stored as the admin user's single API key, tagged `name="bootstrap"`. The
operator's first admin action after bootstrap should be:
1. Create a durable admin user and API key (or issue a durable API
key to the bootstrap admin).
2. Revoke the bootstrap key via `revoke-api-key`.
3. Remove the bootstrap token from any deployment configuration
(Secret, env var, or YAML field — wherever it was sourced).
The `name="bootstrap"` marker makes bootstrap keys easy to detect in
tooling (e.g. a `tg-list-api-keys` filter).
## HTTP forwarding (initial integration)
For the initial gateway integration — before the IAM service is
wired into the authentication middleware — the gateway exposes a
single forwarding endpoint:
```
POST /api/v1/iam
```
- Request body is a JSON encoding of `IamRequest`.
- Response body is a JSON encoding of `IamResponse`.
- The gateway's existing authentication (`GATEWAY_SECRET` bearer)
gates access to this endpoint so the IAM protocol can be
exercised end-to-end in tests without touching the live auth
path.
- This endpoint is **not** the final shape. Once the middleware is
in place, per-operation REST endpoints replace it (for example
`POST /api/v1/auth/login`, `POST /api/v1/users`, `DELETE
/api/v1/api-keys/{id}`), and this generic forwarder is removed.
The endpoint performs only message marshalling: it does not read
or rewrite fields in the request, and it applies no capability
check. All authorisation for user / workspace / key management
lands in the subsequent middleware work.
## Non-goals for this spec
- REST endpoint shape for the final gateway surface — covered in
Phase 2 of the IAM implementation plan, not here.
- OIDC / SAML external IdP protocol — out of scope for open source.
- Key-signing algorithm choice, password KDF choice, JWT claim
layout — implementation details captured in code + ADRs, not
locked in the protocol spec.
## References
- [IAM Contract Specification](iam-contract.md) — the abstract
gateway↔IAM regime contract this protocol implements.
- [Identity and Access Management Specification](iam.md)
- [Capability Vocabulary Specification](capabilities.md)

View file

@ -199,9 +199,9 @@ The server rejects all non-auth messages until authentication succeeds.
The socket remains open on auth failure, allowing the client to retry
with a different token without reconnecting. The client can also send
a new auth message at any time to re-authenticate — for example, to
refresh an expiring JWT or to switch workspace. The
resolved identity (user, workspace, roles) is updated on each
successful auth.
refresh an expiring JWT or to switch workspace. The resolved
identity (handle, workspace, principal_id, source) is updated on
each successful auth.
#### API keys
@ -219,7 +219,7 @@ For programmatic access: CLI tools, scripts, and integrations.
On each request, the gateway resolves an API key by:
1. Hashing the token.
2. Checking a local cache (hash → user/workspace/roles).
2. Checking a local cache (hash → identity).
3. On cache miss, calling the IAM service to resolve.
4. Caching the result with a short TTL (e.g. 60 seconds).
@ -233,9 +233,15 @@ For interactive access via the UI or WebSocket connections.
- A user logs in with username and password. The gateway forwards the
request to the IAM service, which validates the credentials and
returns a signed JWT.
- The JWT carries the user ID, workspace, and roles as claims.
- The JWT carries identity-binding claims only — user id (`sub`)
and the workspace this credential authenticates to. No roles,
no policy state. Per the IAM contract, all policy decisions go
through `authorise`; the gateway never reads roles or other
regime-internal state from the credential.
- The gateway validates JWTs locally using the IAM service's public
signing key — no service call needed on subsequent requests.
signing key — no service call needed for the authentication step;
authorisation calls remain per-request (cached per the contract's
caching rules).
- Token expiry is enforced by standard JWT validation at the time the
request (or WebSocket connection) is made.
- For long-lived WebSocket connections, the JWT is validated at connect
@ -262,6 +268,26 @@ The gateway forwards this to the IAM service, which validates
credentials and returns a signed JWT. The gateway returns the JWT to
the caller.
#### Self-service: `whoami` and `bootstrap-status`
Two side-effect-free probes that exist to support UI affordances
without giving the caller broad read access:
- `POST /api/v1/iam` with `{"operation": "whoami"}` — authenticated
only. Returns the caller's own user record (id, username, name,
email, workspace, roles, enabled, must_change_password,
created). No `users:read` capability is required, because every
authenticated caller can read themselves. The gateway populates
`actor` on the request from the authenticated identity, so the
regime resolves "the caller" without taking a target argument.
- `POST /api/v1/auth/bootstrap-status` — public, side-effect-free.
Returns `{"bootstrap_available": true|false}`. `true` iff
iam-svc is in `bootstrap` mode and its tables are empty (i.e. an
unconsumed `bootstrap` call would currently succeed). Exists so
a first-run UI can decide whether to render the setup flow
without invoking the consuming `bootstrap` op.
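As a concrete illustration of the two probes, using the
`requests` library; the base URL and the placeholder credential
are assumptions, not part of the spec.
```python
import requests

BASE = "http://localhost:8088"

# Authenticated self-lookup: any authenticated caller may read themselves;
# no users:read capability required.
me = requests.post(
    f"{BASE}/api/v1/iam",
    headers={"Authorization": "Bearer <api-key-or-jwt>"},   # placeholder
    json={"operation": "whoami"},
).json()
# -> {"user": {"id": "...", "username": "...", "workspace": "...", ...}}

# Public, side-effect-free first-run probe: no credential required.
status = requests.post(f"{BASE}/api/v1/auth/bootstrap-status").json()
if status.get("bootstrap_available"):
    print("fresh deployment: render the setup flow")
```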
#### IAM service delegation
The gateway stays thin. Its authentication logic is:
@ -285,35 +311,82 @@ authentication uses API keys or JWTs. On first start, the bootstrap
process creates a default workspace and admin user with an initial API
key.
### User identity
### Identity, credentials, and workspace binding
A user belongs to exactly one workspace. The design supports extending
this to multi-workspace access in the future (see
[Extension points](#extension-points)).
The gateway never asks "which workspace does *this user* belong to?".
That question forces every IAM regime to expose a user-to-workspace
mapping, which rules out regimes where the relationship is
many-to-many or does not exist at all (e.g. SSO with IdP-driven
workspace selection).
Instead, the gateway asks "which workspace does *this credential*
authenticate to?" — a question every regime can answer in its own
terms.
A user record contains:
A credential (API key, JWT, OIDC token, etc.) is **bound to a
workspace at issue time**. The IAM regime decides what binding
means:
- **OSS regime** — each user has a home workspace; credentials
issued to that user are bound to that workspace. A 1:1
user-to-workspace constraint is an internal data-model decision,
not a contract assertion.
- **Multi-workspace regime** (future / enterprise) — a user with
access to several workspaces gets a different credential per
workspace. Each credential authenticates to exactly one
workspace; the relationship between user and workspace is a
regime-internal detail the gateway does not see.
When the gateway authenticates a credential, the IAM regime returns
an `Identity` whose `workspace` is the workspace this credential is
for. That value — not "the user's workspace" — is what the gateway
uses for default-fill-in and as input to the IAM `authorise` call.
#### Identity surface
What the gateway holds after `authenticate`:
| Field | Purpose |
|-------|---------|
| `handle` | Opaque token quoted back when calling `authorise`. Regime-defined. |
| `workspace` | The workspace this credential authenticates to. Used as the default if a request omits workspace. |
| `principal_id` | Stable identifier for audit logging (a user id, sub claim, service account id). Never used for authorisation. |
| `source` | How the credential was presented (`api-key`, `jwt`). Logged with audit events; not policy input. |
Anything else — roles, claims, group memberships, policy attributes
— stays inside the regime and is reachable only via `authorise`.
See [`iam-contract.md`](iam-contract.md) for the full contract.
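A minimal sketch of that surface as a dataclass. Only these four
fields are visible to the gateway; the real `Identity` type in
the gateway code may differ in detail.
```python
from dataclasses import dataclass

@dataclass
class Identity:
    handle: str        # opaque; quoted back to authorise(); regime-defined
    workspace: str     # the workspace this credential authenticates to
    principal_id: str  # stable id for audit logging only; never policy input
    source: str        # how the credential was presented: "api-key" or "jwt"
```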
#### OSS user record
The OSS regime stores the following per user. These fields are
**OSS-implementation specifics**, not part of the contract.
| Field | Type | Description |
|-------|------|-------------|
| `id` | string | Unique user identifier (UUID) |
| `name` | string | Display name |
| `email` | string | Email address (optional) |
| `workspace` | string | Workspace the user belongs to |
| `workspace` | string | Home workspace; default binding for issued credentials |
| `roles` | list[string] | Assigned roles (e.g. `["reader"]`) |
| `enabled` | bool | Whether the user can authenticate |
| `created` | datetime | Account creation timestamp |
The `workspace` field maps to the existing `user` field in `Metadata`.
This means the storage-layer isolation (Cassandra, Neo4j, Qdrant
filtering by `user` + `collection`) works without changes — the gateway
sets the `user` metadata field to the authenticated user's workspace.
The `workspace` field on a user record is the **default binding**
used when issuing credentials, not a constraint visible to the
gateway. An enterprise regime may have no user records at all
(authentication delegated to an IdP).
### Workspaces
A workspace is an isolated data boundary. Users belong to a workspace,
and all data operations are scoped to it. Workspaces map to the existing
`user` field in `Metadata` and the corresponding Cassandra keyspace,
Qdrant collection prefix, and Neo4j property filters.
A workspace is an isolated data boundary — a tenancy scope in which
users, flows, configuration, documents, and knowledge graphs live.
Workspaces map to storage-layer isolation: the `user` field in
`Metadata`, the corresponding Cassandra keyspace, the Qdrant
collection prefix, the Neo4j property filter.
Workspace is the most prominent component of an operation's
**resource scope**: when a request says "do X to Y", workspace is
part of "Y". Listing users, creating flows, querying the graph —
all of these target a specific workspace.
| Field | Type | Description |
|-------|------|-------------|
@ -322,57 +395,176 @@ Qdrant collection prefix, and Neo4j property filters.
| `enabled` | bool | Whether the workspace is active |
| `created` | datetime | Creation timestamp |
All data operations are scoped to a workspace. The gateway determines
the effective workspace for each request as follows:
#### Default-fill-in
1. If the request includes a `workspace` parameter, validate it against
the user's assigned workspace.
- If it matches, use it.
- If it does not match, return 403. (This could be extended to
check a workspace access grant list.)
2. If no `workspace` parameter is provided, use the user's assigned
workspace.
If a request omits workspace, the gateway fills it in from the
authenticated identity's bound workspace (`identity.workspace`)
before any IAM check runs. IAM never receives an unresolved
workspace; every `authorise` call sees a concrete value.
The gateway sets the `user` field in `Metadata` to the effective
workspace ID, replacing the caller-supplied `?user=` query parameter.
#### Authorisation
This design ensures forward compatibility. Clients that pass a
workspace parameter will work unchanged if multi-workspace support is
added later. Requests for an unassigned workspace get a clear 403
rather than silent misbehaviour.
Whether the resolved workspace is permitted to be operated on by
this caller is an **IAM decision**, not a gateway one. The gateway
calls `authorise(identity, capability, {workspace: ..., ...})` and
relays the answer. In the OSS regime, the regime checks whether
the caller's permission grants for `<capability>` include this
workspace — see [`capabilities.md`](capabilities.md). In other
regimes the decision could come from group mappings, policies,
relationship tuples, or anything else the regime models.
### Request anatomy
The shape of a request — where workspace appears, where flow
appears, where parameters live — follows from **the level of the
resource being operated on**, not from any single property of the
request like its URL or its required capability.
Resources live at one of three levels (see also the resource model
in [`iam-contract.md`](iam-contract.md)):
| Resource level | Examples | Resource address |
|---|---|---|
| **System** | The user registry, the workspace registry, the IAM signing key, the audit log | empty `{}` |
| **Workspace** | A workspace's config, flow definitions, library, knowledge cores, collections | `{workspace: ...}` |
| **Flow** | A flow's knowledge graph, agent state, LLM context, embeddings, MCP context | `{workspace: ..., flow: ...}` |
For the gateway-to-bus mapping this dictates **where workspace
lives in the message**, but only when workspace is part of the
*resource address*. Workspace can also appear as an *operation
parameter* on system-level resources (see below).
#### Workspace as address vs. parameter
Two distinct roles, two distinct locations:
- **Workspace as address component.** Workspace identifies the
thing being operated on. Used for workspace-level and flow-level
resources. Lives in the addressing layer of the message — the
URL path for HTTP, or the WebSocket envelope alongside `flow` for
flow-scoped operations sent through the Mux.
- **Workspace as operation parameter.** Workspace is data the
operation acts on, while the resource itself is system-level.
Used for operations on the user registry (`create-user with
workspace association W`), the workspace registry (`create-
workspace W`), and other system-level operations that happen to
reference a workspace. Lives in the request body or inner WS
payload alongside the operation's other parameters.
The two roles never coexist on the same operation. Either the
operation addresses something within a workspace (workspace is in
the address) or it operates on a system-level resource with
workspace as a parameter (workspace is in the body) or workspace
is irrelevant (system-level operations like `bootstrap`,
`rotate-signing-key`, `login` itself).
#### Where workspace lives, by request type
| Request type | Resource level | Workspace lives in |
|---|---|---|
| Flow-scoped data plane (`agent`, `graph-rag`, `llm`, `embeddings`, `mcp`, etc.) | Flow | Envelope alongside `flow` (WS) or URL path (HTTP) — part of the address |
| Workspace-scoped control plane (`config`, `library`, `knowledge`, `collection-management`, flow lifecycle) | Workspace | Body / inner request — part of the address |
| User registry ops (`create-user`, `list-users`, `disable-user`, etc.) | System | Body — as a *parameter* (the user's workspace association or a list filter) |
| Workspace registry ops (`create-workspace`, `list-workspaces`, etc.) | System | Body — as a *parameter* (the workspace identifier in `workspace_record`) |
| Credential ops (`create-api-key`, `revoke-api-key`, `change-password`, `reset-password`) | System | Body — as a *parameter* on ops that have one; absent on `change-password` (target is the caller's identity) |
| System ops (`bootstrap`, `login`, `rotate-signing-key`, `get-signing-key-public`) | System | Not present at all |
The classification is deliberate. Users are global objects that
*have* a workspace association; they don't *live* in one. An OSS
regime has
1:1 user-to-workspace; a multi-workspace regime maps a user to many
workspaces; an SSO regime might delegate workspace membership to an
IdP entirely. The gateway treats user-registry operations as
system-level so the contract is the same across regimes — the
workspace association is a parameter the regime interprets in its
own terms.
#### HTTP
HTTP routes by URL path, so the address lives in the URL.
Per-operation REST shape:
- Flow-level: `POST /api/v1/workspaces/{w}/flows/{f}/services/{kind}`
`workspace` and `flow` are URL components.
- Workspace-level: `POST /api/v1/workspaces/{w}/config`,
`/api/v1/workspaces/{w}/library`, etc. — `workspace` is a URL
component.
- System-level: `POST /api/v1/users`, `/api/v1/workspaces`, etc. —
no workspace in URL; if the operation references one, it's a
field in the body.
`/api/v1/iam` is itself registry-driven: the body's `operation`
field is looked up against the registry to obtain the capability,
resource shape, and parameter shape per operation, rather than
gating the whole endpoint with a single coarse capability.
#### WebSocket Mux
The Mux envelope is the addressing layer for flow-scoped
operations. For workspace-level and system-level operations the
envelope routes by `service` only, and the inner request payload
carries the address components or parameters as appropriate. See
[`iam-contract.md`](iam-contract.md) for the operation-registry
mechanism the Mux uses to know which fields to read.
### Roles and access control
Three roles with fixed permissions:
Roles are an OSS-regime concept and live entirely in the IAM
service. The gateway does not enumerate or check them — it asks
`authorise(identity, capability, resource, parameters)` per
request and the regime maps the caller's roles to a decision.
| Role | Data operations | Admin operations | System |
|------|----------------|-----------------|--------|
| `reader` | Query knowledge graph, embeddings, RAG | None | None |
| `writer` | All reader operations + load documents, manage collections | None | None |
| `admin` | All writer operations | Config, flows, collection management, user management | Metrics |
The OSS regime ships three roles:
Role checks happen at the gateway before dispatching to backend
services. Each endpoint declares the minimum role required:
| Role | Capabilities granted |
|------|----------------------|
| `reader` | Read capabilities on data and config (`graph:read`, `documents:read`, `rows:read`, `config:read`, `flows:read`, `knowledge:read`, `collections:read`, `keys:self`, plus the per-service caps `agent`, `llm`, `embeddings`, `mcp`). |
| `writer` | All reader capabilities, plus `graph:write`, `documents:write`, `rows:write`, `knowledge:write`, `collections:write`. |
| `admin` | All writer capabilities, plus `config:write`, `flows:write`, `users:read`, `users:write`, `users:admin`, `keys:admin`, `workspaces:admin`, `iam:admin`, `metrics:read`. |
| Endpoint pattern | Minimum role |
|-----------------|--------------|
| `GET /api/v1/socket` (queries) | `reader` |
| `POST /api/v1/librarian` | `writer` |
| `POST /api/v1/flow/*/import/*` | `writer` |
| `POST /api/v1/config` | `admin` |
| `GET /api/v1/flow/*` | `admin` |
| `GET /api/metrics` | `admin` |
Workspace scope is a property of the *grant*, not of the user or
role. In the OSS regime each capability granted by `reader` /
`writer` is scoped to the workspace the user record is associated
with; capabilities granted by `admin` are scoped to `*` (every
workspace). A user is a system-level object — they don't "live
in" a workspace, they hold permissions whose scope happens to
reference one.
Roles are hierarchical: `admin` implies `writer`, which implies
`reader`.
The OSS regime is deliberately limited to one workspace association
per user; future regimes are free to grant the same user different
permissions in different workspaces, or use a non-workspace scope
entirely. This is regime-internal — neither the contract nor the
gateway carries an assumption either way.
The gateway gates each endpoint by *capability*, not by role.
Capabilities are declared per operation in the gateway's operation
registry; see [`iam-contract.md`](iam-contract.md) for the
registry mechanism and [`capabilities.md`](capabilities.md) for
the capability vocabulary.
### IAM service
The IAM service is a new backend service that manages all identity and
access data. It is the authority for users, workspaces, API keys, and
credentials. The gateway delegates to it.
The IAM service is a backend service that implements the
[IAM contract](iam-contract.md) — `authenticate`, `authorise`, and
the management operations the gateway forwards. It is the
authority for identity, credential validation, and access decisions.
The gateway treats it as a black box behind the contract; nothing
in the gateway is regime-specific.
#### Data model
The OSS distribution ships one IAM regime: a role-based service
backed by Cassandra, described in
[`iam-protocol.md`](iam-protocol.md). Enterprise / future regimes
can replace this implementation without changing the gateway, the
wire protocol between gateway and backends, or the capability
vocabulary — see the contract spec for the abstraction the gateway
is wired against and the implementation notes for what other
regimes look like.
#### OSS data model
The OSS regime stores users, workspaces, API keys, and signing
keys in Cassandra. This is an **OSS regime implementation
detail**; it is not part of the contract. Other regimes will have
different (or no) data models.
```
iam_workspaces (
@ -423,44 +615,89 @@ resolve API keys and to handle login requests. User management
operations (create user, revoke key, etc.) also go through the IAM
service.
### Error policy
External error responses carry **no diagnostic detail** for
authentication or access-control failures. The goal is to give an
attacker probing the endpoint no signal about which condition they
tripped.
| Category | HTTP | Body | WebSocket frame |
|----------|------|------|-----------------|
| Authentication failure | `401 Unauthorized` | `{"error": "auth failure"}` | `{"type": "auth-failed", "error": "auth failure"}` |
| Access control failure | `403 Forbidden` | `{"error": "access denied"}` | `{"error": "access denied"}` (endpoint-specific frame type) |
"Authentication failure" covers missing credential, malformed
credential, invalid signature, expired token, revoked API key, and
unknown API key — all indistinguishable to the caller.
"Access control failure" covers role insufficient, workspace
mismatch, user disabled, and workspace disabled — all
indistinguishable to the caller.
**Server-side logging is richer.** The audit log records the specific
reason (`"workspace-mismatch: user alice assigned 'acme', requested
'beta'"`, `"role-insufficient: admin required, user has writer"`,
etc.) for operators and post-incident forensics. These messages never
appear in responses.
Other error classes (bad request, internal error) remain descriptive
because they do not reveal anything about the auth or access-control
surface — e.g. `"missing required field 'workspace'"` or
`"invalid JSON"` is fine.
### Gateway changes
The current `Authenticator` class is replaced with a thin authentication
middleware that delegates to the IAM service:
The current `Authenticator` class is replaced with a thin
authentication+authorisation middleware that delegates to the IAM
service per the IAM contract. The gateway performs no role check
itself — authorisation is asked of the regime via `authorise`.
For HTTP requests:
1. Extract Bearer token from the `Authorization` header.
2. If the token has JWT format (dotted structure):
- Validate signature locally using the cached public key.
- Extract user ID, workspace, and roles from claims.
- Build an `Identity` from `sub` and `workspace` claims (no
other claims are consulted).
3. Otherwise, treat as an API key:
- Hash the token and check the local cache.
- On cache miss, call the IAM service to resolve.
- Cache the result (user/workspace/roles) with a short TTL.
- On cache miss, call the IAM service to resolve to an
`Identity` (handle, workspace, principal_id, source).
- Cache the result with a short TTL.
4. If neither succeeds, return 401.
5. If the user or workspace is disabled, return 403.
6. Check the user's role against the endpoint's minimum role. If
insufficient, return 403.
7. Resolve the effective workspace:
- If the request includes a `workspace` parameter, validate it
against the user's assigned workspace. Return 403 on mismatch.
- If no `workspace` parameter, use the user's assigned workspace.
8. Set the `user` field in the request context to the effective
workspace ID. This propagates through `Metadata` to all downstream
services.
5. Look up the operation in the gateway's operation registry to get
`(capability, resource_level, extractors)`. Build the resource
address (system / workspace / flow level) and parameters from
the request.
6. Default-fill the workspace into the body when the operation is
workspace- or flow-level (so downstream code sees a single
canonical address); the resource address keeps its supplied
value.
7. Call `authorise(identity, capability, resource, parameters)`.
On allow, forward the request; on deny, return 403. On regime
error, fail closed (401 / 503 per deployment).
8. Cache the decision per the contract's caching rules (clamped
above by a deployment-set ceiling).
9. For requests forwarded to iam-svc, set `actor` on the body
from `identity.handle`, overwriting any caller-supplied value.
See [`iam-contract.md`](iam-contract.md#actor-injection).
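A rough sketch of steps 5-8, under assumptions: the registry
structure, the capability mappings, the authoriser's return
shape, and the cache interface are all illustrative, not the
middleware's real API.
```python
# Hypothetical operation-registry entries:
#   key -> (capability, resource level, address fields)
OPERATION_REGISTRY = {
    "flow-service:graph-rag": ("graph:read",  "flow",      ["workspace", "flow"]),
    "config-get":             ("config:read", "workspace", ["workspace"]),
    "create-user":            ("users:write", "system",    []),
}

async def authorise_request(identity, operation, body, authoriser, cache):
    capability, level, addr_fields = OPERATION_REGISTRY[operation]

    # Step 6: default-fill workspace for workspace- and flow-level resources.
    if level in ("workspace", "flow") and not body.get("workspace"):
        body["workspace"] = identity.workspace

    resource = {f: body[f] for f in addr_fields}                 # resource address
    params = {k: v for k, v in body.items() if k not in resource}

    # Steps 7-8: consult the decision cache, then the regime; fail closed.
    decision = cache.get(identity.handle, capability, resource, params)
    if decision is None:
        decision, suggested_ttl = await authoriser.authorise(
            identity, capability, resource, params,
        )
        cache.put(identity.handle, capability, resource, params,
                  decision, suggested_ttl)
    if not decision:
        raise PermissionError("access denied")                   # surfaced as 403
    return body
```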
For WebSocket connections:
1. Accept the connection in an unauthenticated state.
2. Wait for an auth message (`{"type": "auth", "token": "..."}`).
3. Validate the token using the same logic as steps 2-7 above.
3. Validate the token using the same logic as steps 1-3 above.
4. On success, attach the resolved identity to the connection and
send `{"type": "auth-ok", ...}`.
5. On failure, send `{"type": "auth-failed", ...}` but keep the
socket open.
6. Reject all non-auth messages until authentication succeeds.
7. Accept new auth messages at any time to re-authenticate.
8. For each subsequent request frame, look up
`flow-service:<service>` in the registry and call `authorise`
against the `{workspace, flow}` resource — same authority
gateway HTTP callers see, evaluated per-frame.
### CLI changes
@ -713,6 +950,16 @@ These are not implemented but the architecture does not preclude them:
- **Multi-workspace access.** Users could be granted access to
additional workspaces beyond their primary assignment. The workspace
validation step checks a grant list instead of a single assignment.
- **Workspace resolver.** Workspace resolution on each authenticated
request — "given this user and this requested workspace, which
workspace (if any) may the request operate on?" — is encapsulated
in a single pluggable resolver. The open-source edition ships a
resolver that permits only the user's single assigned workspace;
enterprise editions that implement multi-workspace access swap in a
resolver that consults a permitted set. The wire protocol (the
optional `workspace` field on the authenticated request) is
identical in both editions, so clients written against one edition
work unchanged against the other.
- **Rules-based access control.** A separate access control service
could evaluate fine-grained policies (per-collection permissions,
operation-level restrictions, time-based access). The gateway
@ -848,10 +1095,15 @@ service, not in the config service. Reasons:
- **API key scoping.** API keys could be scoped to specific collections
within a workspace rather than granting workspace-wide access. To be
designed when the need arises.
- **tg-init-trustgraph** only initialises a single workspace.
## References
- [IAM Contract Specification](iam-contract.md) — the gateway↔IAM
regime abstraction this design is wired against.
- [IAM Service Protocol Specification](iam-protocol.md) — the OSS
regime's wire-level protocol.
- [Capability Vocabulary Specification](capabilities.md) — the
capability strings the gateway uses as `authorise` input.
- [Data Ownership and Information Separation](data-ownership-model.md)
- [MCP Tool Bearer Token Specification](mcp-tool-bearer-token.md)
- [Multi-Tenant Support Specification](multi-tenant-support.md)

252
iam-testing.txt Normal file
View file

@ -0,0 +1,252 @@
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation": "bootstrap"}'
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation": "resolve-api-key", "api_key": "tg_r-n43hDWV9WOY06w6o5YpevAxirlS33D"}'
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation": "resolve-api-key", "api_key": "asdalsdjasdkasdasda"}'
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"list-users","workspace":"default"}'
# 1. Admin creates a writer user "alice"
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{
"operation": "create-user",
"workspace": "default",
"user": {
"username": "alice",
"name": "Alice",
"email": "alice@example.com",
"password": "changeme",
"roles": ["writer"]
}
}'
# expect: {"user": {"id": "<alice-uuid>", ...}} — grab alice's uuid
# 2. Issue alice an API key
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{
"operation": "create-api-key",
"workspace": "default",
"key": {
"user_id": "f2363a10-3b83-44ea-a008-43caae8ba607",
"name": "alice-laptop"
}
}'
# expect: {"api_key_plaintext": "tg_...", "api_key": {"id": "<key-uuid>", "prefix": "tg_xxxx", ...}}
# 3. Resolve alice's key — should return alice's id + workspace + writer role
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"resolve-api-key","api_key":"tg_gt4buvk5NG-QS7oP_0Gk5yTWyj1qensf"}'
# expect: {"resolved_user_id":"<alice-uuid>","resolved_workspace":"default","resolved_roles":["writer"]}
# 4. List alice's keys (admin view of alice's keys)
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"list-api-keys","workspace":"default","user_id":"f2363a10-3b83-44ea-a008-43caae8ba607"}'
# expect: {"api_keys": [{"id":"<key-uuid>","user_id":"<alice-uuid>","name":"alice-laptop","prefix":"tg_xxxx",...}]}
# 5. Revoke alice's key
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"revoke-api-key","workspace":"default","key_id":"55f1c1f7-5448-49fd-9eda-56c192b61177"}'
# expect: {} (empty, no error)
# 6. Confirm the revoked key no longer resolves
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"resolve-api-key","api_key":"tg_gt4buvk5NG-QS7oP_0Gk5yTWyj1qensf"}'
# expect: {"error":{"type":"auth-failed","message":"unknown api key"}}
----------------------------------------------------------------------------
You'll want to re-bootstrap a fresh deployment to pick up the new signing-key row (or accept that login will lazily generate one on first
call). Then:
# 1. Create a user with a known password (admin's password is random)
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"create-user","workspace":"default","user":{"username":"alice","password":"s3cret","roles":["writer"]}}'
# 2. Log alice in
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"login","username":"alice","password":"s3cret"}'
# expect: {"jwt":"eyJ...","jwt_expires":"2026-..."}
# 3. Fetch the public key (what the gateway will use later to verify)
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"get-signing-key-public"}'
# expect: {"signing_key_public":"-----BEGIN PUBLIC KEY-----\n..."}
# 4. Wrong password
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Authorization: Bearer $GATEWAY_SECRET" \
-H "Content-Type: application/json" \
-d '{"operation":"login","username":"alice","password":"nope"}'
# expect: {"error":{"type":"auth-failed","message":"bad credentials"}}
-----BEGIN PUBLIC KEY-----
MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAseLB/a9Bo/RN/Rb/x763
+vdxmUKG75oWsXBmbwZGDXyN6fwqZ3L7cEje93qK0PYFuCHxhY1Hn0gW7FZ8ovH+
qEksekUlpfPYqKGiT5Mb0DKk49D4yKkIbJFugWalpwIilvRbQO0jy3V8knqGQ1xL
NfNYFrI2Rxe0Tq2OHVYc5YwYbyj1nz2TY5fd9qrzXtGRv5HZztkl25lWhRvG9G0K
urKDdBDbi894gIYorXvcwZw/b1GDXG/aUy/By1Oy3hXnCLsN8pA3nA437TTTWxHx
QgPH15jIF9hezO+3/ESZ7EhVEtgmwTxPddfXRa0ZoT6JyWOgcloKtnP4Lp9eQ4va
yQIDAQAB
-----END PUBLIC KEY-----
New operations:
- change-password — self-service. Requires current + new password.
- reset-password — admin-driven. Generates a random temporary, sets must_change_password=true, returns plaintext once.
- get-user, update-user, disable-user — workspace-scoped. update-user refuses to change username (immutable — error if different) and refuses
password-via-update. disable-user also revokes all the user's API keys, per spec.
- create-workspace, list-workspaces, get-workspace, update-workspace, disable-workspace — system-level. disable-workspace cascades: disables
all users + revokes all their keys. Rejects ids starting with _ (reserved, per the bootstrap framework convention).
- rotate-signing-key — generates a new Ed25519 key, retires the current one (sets retired timestamp; row stays for future grace-period
validation), switches the in-memory cache.
Touched files:
- trustgraph-flow/trustgraph/tables/iam.py — added retire_signing_key, update_user_profile, update_user_password, update_user_enabled,
update_workspace.
- trustgraph-flow/trustgraph/iam/service/iam.py — 12 new handlers + dispatch entries.
- trustgraph-base/trustgraph/base/iam_client.py — matching client helpers for all of them.
Smoke-test suggestions:
# change password for alice (from "s3cret" → "n3wer")
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"change-password","user_id":"b2960feb-caef-401d-af65-01bdb6960cad","password":"s3cret","new_password":"n3wer"}'
# login with new password
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"login","username":"alice","password":"n3wer"}'
# admin resets alice's password
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"reset-password","workspace":"default","user_id":"b2960feb-caef-401d-af65-01bdb6960cad"}'
# → {"temporary_password":"..."}
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"login","username":"alice","password":"fH2ttyrIcVXCIkH_"}'
# create a second workspace
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"create-workspace","workspace_record":{"id":"acme","name":"Acme Corp","enabled":true}}'
# rotate signing key (next login produces a JWT signed by a new kid)
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-d '{"operation":"rotate-signing-key"}'
curl -s -X POST "http://localhost:8088/api/v1/flow" \
-H "Authorization: Bearer tg_bs_kBAhfejiEJmbcO1gElbxk3MpV7wQFygP" \
-H "Content-Type: application/json" \
-d '{"operation":"list-flows"}'
curl -s -X POST "http://localhost:8088/api/v1/iam" \
-H "Authorization: Bearer tg_bs_kBAhfejiEJmbcO1gElbxk3MpV7wQFygP" \
-H "Content-Type: application/json" \
-d '{"operation":"list-users"}'
curl -s -X POST http://localhost:8088/api/v1/iam \
-H "Content-Type: application/json" \
-H "Authorization: Bearer tg_bs_kBAhfejiEJmbcO1gElbxk3MpV7wQFygP" \
-d '{
"operation": "create-user",
"workspace": "default",
"user": {
"username": "alice",
"name": "Alice",
"email": "alice@example.com",
"password": "s3cret",
"roles": ["writer"]
}
}'
# Login (public, no token needed) → returns a JWT
curl -s -X POST "http://localhost:8088/api/v1/auth/login" \
-H "Content-Type: application/json" \
-d '{"username":"alice","password":"s3cret"}'
export TRUSTGRAPH_TOKEN=$(tg-bootstrap-iam) # on fresh bootstrap-mode deployment
# or set to your existing admin API key
tg-create-user --username alice --roles writer
# → prints alice's user id
ALICE_ID=<uuid from above>
ALICE_KEY=$(tg-create-api-key --user-id $ALICE_ID --name alice-laptop)
# → alice's plaintext API key
tg-list-users
tg-list-api-keys --user-id $ALICE_ID
tg-revoke-api-key --key-id <...>
tg-disable-user --user-id $ALICE_ID
# User self-service:
tg-login --username alice # prompts for password, prints JWT
tg-change-password # prompts for current + new

View file

@ -14,13 +14,13 @@ from trustgraph.embeddings.ollama.processor import Processor
class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase):
"""Test Ollama dynamic model selection"""
@patch('trustgraph.embeddings.ollama.processor.Client')
@patch('trustgraph.embeddings.ollama.processor.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__')
async def test_client_initialized_with_host(self, mock_embeddings_init, mock_async_init, mock_client_class):
"""Test that Ollama client is initialized with correct host"""
# Arrange
mock_ollama_client = Mock()
mock_ollama_client = AsyncMock()
mock_response = Mock()
mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]]
mock_ollama_client.embed.return_value = mock_response
@ -36,13 +36,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase):
mock_client_class.assert_called_once_with(host="http://localhost:11434")
assert processor.default_model == "test-model"
@patch('trustgraph.embeddings.ollama.processor.Client')
@patch('trustgraph.embeddings.ollama.processor.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__')
async def test_on_embeddings_uses_default_model(self, mock_embeddings_init, mock_async_init, mock_client_class):
"""Test that on_embeddings uses default model when no model specified"""
# Arrange
mock_ollama_client = Mock()
mock_ollama_client = AsyncMock()
mock_response = Mock()
mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]]
mock_ollama_client.embed.return_value = mock_response
@ -62,13 +62,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase):
)
assert result == [[0.1, 0.2, 0.3, 0.4, 0.5]]
@patch('trustgraph.embeddings.ollama.processor.Client')
@patch('trustgraph.embeddings.ollama.processor.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__')
async def test_on_embeddings_uses_specified_model(self, mock_embeddings_init, mock_async_init, mock_client_class):
"""Test that on_embeddings uses specified model when provided"""
# Arrange
mock_ollama_client = Mock()
mock_ollama_client = AsyncMock()
mock_response = Mock()
mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]]
mock_ollama_client.embed.return_value = mock_response
@ -88,13 +88,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase):
)
assert result == [[0.1, 0.2, 0.3, 0.4, 0.5]]
@patch('trustgraph.embeddings.ollama.processor.Client')
@patch('trustgraph.embeddings.ollama.processor.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__')
async def test_multiple_model_switches(self, mock_embeddings_init, mock_async_init, mock_client_class):
"""Test switching between multiple models"""
# Arrange
mock_ollama_client = Mock()
mock_ollama_client = AsyncMock()
mock_response = Mock()
mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]]
mock_ollama_client.embed.return_value = mock_response
@ -118,13 +118,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase):
assert calls[2][1]['model'] == "model-a"
assert calls[3][1]['model'] == "test-model" # Default
@patch('trustgraph.embeddings.ollama.processor.Client')
@patch('trustgraph.embeddings.ollama.processor.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__')
async def test_none_model_uses_default(self, mock_embeddings_init, mock_async_init, mock_client_class):
"""Test that None model parameter falls back to default"""
# Arrange
mock_ollama_client = Mock()
mock_ollama_client = AsyncMock()
mock_response = Mock()
mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]]
mock_ollama_client.embed.return_value = mock_response
@ -143,13 +143,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase):
input=["test text"]
)
@patch('trustgraph.embeddings.ollama.processor.Client')
@patch('trustgraph.embeddings.ollama.processor.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__')
async def test_initialization_without_model_uses_default(self, mock_embeddings_init, mock_async_init, mock_client_class):
"""Test initialization without model parameter uses module default"""
# Arrange
mock_ollama_client = Mock()
mock_ollama_client = AsyncMock()
mock_client_class.return_value = mock_ollama_client
mock_async_init.return_value = None
mock_embeddings_init.return_value = None

View file

@ -1,69 +1,447 @@
"""
Tests for Gateway Authentication
Tests for gateway/auth.py IamAuth, JWT verification, API key
resolution cache.
JWTs are signed with real Ed25519 keypairs generated per-test, so
the crypto path is exercised end-to-end without mocks. API-key
resolution is tested against a stubbed IamClient since the real
one requires pub/sub.
"""
import base64
import json
import time
from unittest.mock import AsyncMock, Mock, patch
import pytest
from aiohttp import web
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import ed25519
from trustgraph.gateway.auth import Authenticator
from trustgraph.gateway.auth import (
IamAuth, Identity,
_b64url_decode, _verify_jwt_eddsa,
API_KEY_CACHE_TTL,
)
class TestAuthenticator:
"""Test cases for Authenticator class"""
# -- helpers ---------------------------------------------------------------
def test_authenticator_initialization_with_token(self):
"""Test Authenticator initialization with valid token"""
auth = Authenticator(token="test-token-123")
assert auth.token == "test-token-123"
assert auth.allow_all is False
def test_authenticator_initialization_with_allow_all(self):
"""Test Authenticator initialization with allow_all=True"""
auth = Authenticator(allow_all=True)
assert auth.token is None
assert auth.allow_all is True
def _b64url(data: bytes) -> str:
return base64.urlsafe_b64encode(data).rstrip(b"=").decode("ascii")
def test_authenticator_initialization_without_token_raises_error(self):
"""Test Authenticator initialization without token raises RuntimeError"""
with pytest.raises(RuntimeError, match="Need a token"):
Authenticator()
def test_authenticator_initialization_with_empty_token_raises_error(self):
"""Test Authenticator initialization with empty token raises RuntimeError"""
with pytest.raises(RuntimeError, match="Need a token"):
Authenticator(token="")
def make_keypair():
priv = ed25519.Ed25519PrivateKey.generate()
public_pem = priv.public_key().public_bytes(
encoding=serialization.Encoding.PEM,
format=serialization.PublicFormat.SubjectPublicKeyInfo,
).decode("ascii")
return priv, public_pem
def test_permitted_with_allow_all_returns_true(self):
"""Test permitted method returns True when allow_all is enabled"""
auth = Authenticator(allow_all=True)
# Should return True regardless of token or roles
assert auth.permitted("any-token", []) is True
assert auth.permitted("different-token", ["admin"]) is True
assert auth.permitted(None, ["user"]) is True
def test_permitted_with_matching_token_returns_true(self):
"""Test permitted method returns True with matching token"""
auth = Authenticator(token="secret-token")
# Should return True when tokens match
assert auth.permitted("secret-token", []) is True
assert auth.permitted("secret-token", ["admin", "user"]) is True
def sign_jwt(priv, claims, alg="EdDSA"):
header = {"alg": alg, "typ": "JWT", "kid": "kid-test"}
h = _b64url(json.dumps(header, separators=(",", ":"), sort_keys=True).encode())
p = _b64url(json.dumps(claims, separators=(",", ":"), sort_keys=True).encode())
signing_input = f"{h}.{p}".encode("ascii")
if alg == "EdDSA":
sig = priv.sign(signing_input)
else:
raise ValueError(f"test helper doesn't sign {alg}")
return f"{h}.{p}.{_b64url(sig)}"
def test_permitted_with_non_matching_token_returns_false(self):
"""Test permitted method returns False with non-matching token"""
auth = Authenticator(token="secret-token")
# Should return False when tokens don't match
assert auth.permitted("wrong-token", []) is False
assert auth.permitted("different-token", ["admin"]) is False
assert auth.permitted(None, ["user"]) is False
def test_permitted_with_token_and_allow_all_returns_true(self):
"""Test permitted method with both token and allow_all set"""
auth = Authenticator(token="test-token", allow_all=True)
# allow_all should take precedence
assert auth.permitted("any-token", []) is True
assert auth.permitted("wrong-token", ["admin"]) is True
def make_request(auth_header):
"""Minimal stand-in for an aiohttp request — IamAuth only reads
``request.headers["Authorization"]``."""
req = Mock()
req.headers = {}
if auth_header is not None:
req.headers["Authorization"] = auth_header
return req
# -- pure helpers ----------------------------------------------------------
class TestB64UrlDecode:
def test_round_trip_without_padding(self):
data = b"hello"
encoded = _b64url(data)
assert _b64url_decode(encoded) == data
def test_handles_various_lengths(self):
for s in (b"a", b"ab", b"abc", b"abcd", b"abcde"):
assert _b64url_decode(_b64url(s)) == s
# -- JWT verification -----------------------------------------------------
class TestVerifyJwtEddsa:
def test_valid_jwt_passes(self):
priv, pub = make_keypair()
claims = {
"sub": "user-1", "workspace": "default",
"iat": int(time.time()),
"exp": int(time.time()) + 60,
}
token = sign_jwt(priv, claims)
got = _verify_jwt_eddsa(token, pub)
assert got["sub"] == "user-1"
assert got["workspace"] == "default"
def test_expired_jwt_rejected(self):
priv, pub = make_keypair()
claims = {
"sub": "user-1", "workspace": "default",
"iat": int(time.time()) - 3600,
"exp": int(time.time()) - 1,
}
token = sign_jwt(priv, claims)
with pytest.raises(ValueError, match="expired"):
_verify_jwt_eddsa(token, pub)
def test_bad_signature_rejected(self):
priv_a, _ = make_keypair()
_, pub_b = make_keypair()
claims = {
"sub": "user-1", "workspace": "default",
"iat": int(time.time()),
"exp": int(time.time()) + 60,
}
token = sign_jwt(priv_a, claims)
# pub_b never signed this token.
with pytest.raises(Exception):
_verify_jwt_eddsa(token, pub_b)
def test_malformed_jwt_rejected(self):
_, pub = make_keypair()
with pytest.raises(ValueError, match="malformed"):
_verify_jwt_eddsa("not-a-jwt", pub)
def test_unsupported_algorithm_rejected(self):
priv, pub = make_keypair()
# Manually build an "alg":"HS256" header — no signer needed
# since we expect it to bail before verifying.
header = {"alg": "HS256", "typ": "JWT", "kid": "x"}
payload = {
"sub": "user-1", "workspace": "default",
"iat": int(time.time()), "exp": int(time.time()) + 60,
}
h = _b64url(json.dumps(header, separators=(",", ":")).encode())
p = _b64url(json.dumps(payload, separators=(",", ":")).encode())
sig = _b64url(b"not-a-real-sig")
token = f"{h}.{p}.{sig}"
with pytest.raises(ValueError, match="unsupported alg"):
_verify_jwt_eddsa(token, pub)
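Taken together, the cases above pin the verifier's contract: three-part shape check, EdDSA-only header, Ed25519 signature check against the PEM public key, then expiry. A minimal sketch satisfying those assertions, reusing the `_b64url_decode`, `json`, `time` and `serialization` names already imported in this file (illustrative only; the production `_verify_jwt_eddsa` may be structured differently):

def _verify_jwt_eddsa_sketch(token, public_pem):
    # Shape first: anything that isn't header.payload.signature is malformed.
    parts = token.split(".")
    if len(parts) != 3:
        raise ValueError("malformed JWT")
    h_b64, p_b64, s_b64 = parts
    header = json.loads(_b64url_decode(h_b64))
    # Only EdDSA is accepted; bail before any signature work.
    if header.get("alg") != "EdDSA":
        raise ValueError(f"unsupported alg: {header.get('alg')}")
    pub = serialization.load_pem_public_key(public_pem.encode("ascii"))
    # Ed25519PublicKey.verify raises InvalidSignature on a bad signature.
    pub.verify(_b64url_decode(s_b64), f"{h_b64}.{p_b64}".encode("ascii"))
    claims = json.loads(_b64url_decode(p_b64))
    if claims.get("exp", 0) < int(time.time()):
        raise ValueError("token expired")
    return claims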
# -- Identity --------------------------------------------------------------
class TestIdentity:
def test_fields(self):
i = Identity(
handle="u", workspace="w",
principal_id="u", source="api-key",
)
assert i.handle == "u"
assert i.workspace == "w"
assert i.principal_id == "u"
assert i.source == "api-key"
# -- IamAuth.authenticate --------------------------------------------------
class TestIamAuthDispatch:
"""``authenticate()`` chooses between the JWT and API-key paths
by shape of the bearer."""
@pytest.mark.asyncio
async def test_no_authorization_header_raises_401(self):
auth = IamAuth(backend=Mock())
with pytest.raises(web.HTTPUnauthorized):
await auth.authenticate(make_request(None))
@pytest.mark.asyncio
async def test_non_bearer_header_raises_401(self):
auth = IamAuth(backend=Mock())
with pytest.raises(web.HTTPUnauthorized):
await auth.authenticate(make_request("Basic whatever"))
@pytest.mark.asyncio
async def test_empty_bearer_raises_401(self):
auth = IamAuth(backend=Mock())
with pytest.raises(web.HTTPUnauthorized):
await auth.authenticate(make_request("Bearer "))
@pytest.mark.asyncio
async def test_unknown_format_raises_401(self):
# Not tg_... and not dotted-JWT shape.
auth = IamAuth(backend=Mock())
with pytest.raises(web.HTTPUnauthorized):
await auth.authenticate(make_request("Bearer garbage"))
@pytest.mark.asyncio
async def test_valid_jwt_resolves_to_identity(self):
priv, pub = make_keypair()
claims = {
"sub": "user-1", "workspace": "default",
"iat": int(time.time()),
"exp": int(time.time()) + 60,
}
token = sign_jwt(priv, claims)
auth = IamAuth(backend=Mock())
auth._signing_public_pem = pub
ident = await auth.authenticate(
make_request(f"Bearer {token}")
)
assert ident.handle == "user-1"
assert ident.workspace == "default"
assert ident.principal_id == "user-1"
assert ident.source == "jwt"
@pytest.mark.asyncio
async def test_jwt_without_public_key_fails(self):
# If the gateway hasn't fetched IAM's public key yet, JWTs
# must not validate — even ones that would otherwise pass.
priv, _ = make_keypair()
claims = {
"sub": "user-1", "workspace": "default",
"iat": int(time.time()), "exp": int(time.time()) + 60,
}
token = sign_jwt(priv, claims)
auth = IamAuth(backend=Mock())
# _signing_public_pem defaults to None
with pytest.raises(web.HTTPUnauthorized):
await auth.authenticate(make_request(f"Bearer {token}"))
@pytest.mark.asyncio
async def test_api_key_path(self):
auth = IamAuth(backend=Mock())
async def fake_resolve(api_key):
assert api_key == "tg_testkey"
# Roles are returned by the regime as a hint but the
# gateway ignores them — kept here so the resolve
# protocol shape is exercised.
return ("user-xyz", "default", ["admin"])
async def fake_with_client(op):
return await op(Mock(resolve_api_key=fake_resolve))
with patch.object(auth, "_with_client", side_effect=fake_with_client):
ident = await auth.authenticate(
make_request("Bearer tg_testkey")
)
assert ident.handle == "user-xyz"
assert ident.workspace == "default"
assert ident.principal_id == "user-xyz"
assert ident.source == "api-key"
@pytest.mark.asyncio
async def test_api_key_rejection_masked_as_401(self):
auth = IamAuth(backend=Mock())
async def fake_with_client(op):
raise RuntimeError("auth-failed: unknown api key")
with patch.object(auth, "_with_client", side_effect=fake_with_client):
with pytest.raises(web.HTTPUnauthorized):
await auth.authenticate(
make_request("Bearer tg_bogus")
)
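The dispatch above is purely shape-based: a ``tg_``-prefixed opaque string goes down the API-key path, a dotted three-segment token is treated as a JWT, and anything else is rejected before either path runs. A hedged sketch of that classification (the dot-count heuristic is an assumption drawn from the "dotted-JWT shape" comment, not the gateway's exact check):

def classify_bearer_sketch(bearer):
    if bearer.startswith("tg_"):
        return "api-key"
    if bearer.count(".") == 2:
        return "jwt"
    raise web.HTTPUnauthorized()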
# -- API key cache ---------------------------------------------------------
class TestApiKeyCache:
@pytest.mark.asyncio
async def test_cache_hit_skips_iam(self):
auth = IamAuth(backend=Mock())
calls = {"n": 0}
async def fake_with_client(op):
calls["n"] += 1
return await op(Mock(
resolve_api_key=AsyncMock(
return_value=("u", "default", ["reader"]),
)
))
with patch.object(auth, "_with_client", side_effect=fake_with_client):
await auth.authenticate(make_request("Bearer tg_k1"))
await auth.authenticate(make_request("Bearer tg_k1"))
await auth.authenticate(make_request("Bearer tg_k1"))
# Only the first lookup reaches IAM; the rest are cache hits.
assert calls["n"] == 1
@pytest.mark.asyncio
async def test_different_keys_are_separately_cached(self):
auth = IamAuth(backend=Mock())
seen = []
async def fake_with_client(op):
async def resolve(plaintext):
seen.append(plaintext)
return ("u-" + plaintext, "default", ["reader"])
return await op(Mock(resolve_api_key=resolve))
with patch.object(auth, "_with_client", side_effect=fake_with_client):
a = await auth.authenticate(make_request("Bearer tg_a"))
b = await auth.authenticate(make_request("Bearer tg_b"))
assert a.handle == "u-tg_a"
assert b.handle == "u-tg_b"
assert seen == ["tg_a", "tg_b"]
@pytest.mark.asyncio
async def test_cache_has_ttl_constant_set(self):
# Not a behaviour test — just ensures we don't accidentally
# set TTL to 0 (which would defeat the cache) or to a week.
assert 10 <= API_KEY_CACHE_TTL <= 3600
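All three cache cases above are satisfied by a plain expiring map keyed on the API key plaintext. A sketch under that assumption (class, attribute and method names here are illustrative, not the gateway's; ``Identity`` and ``time`` are reused from this file's imports):

class ApiKeyCacheSketch:
    def __init__(self, resolve, ttl=API_KEY_CACHE_TTL):
        self._resolve = resolve   # async: api_key -> (handle, workspace, roles)
        self._ttl = ttl
        self._cache = {}          # api_key -> (expires_at, Identity)

    async def lookup(self, api_key):
        now = time.time()
        hit = self._cache.get(api_key)
        if hit and hit[0] > now:
            return hit[1]         # fresh entry: no IAM round trip
        handle, workspace, _roles = await self._resolve(api_key)
        ident = Identity(handle=handle, workspace=workspace,
                         principal_id=handle, source="api-key")
        self._cache[api_key] = (now + self._ttl, ident)
        return ident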
# -- IamAuth.authorise -----------------------------------------------------
class TestAuthorise:
"""``authorise()`` is the gateway's only authorisation entry
point under the IAM contract. It calls iam-svc, caches the
decision for the regime's TTL (clamped above), and raises 403
on deny / 401 on regime error (fail closed)."""
def _make_identity(self, handle="u-1", workspace="default"):
return Identity(
handle=handle, workspace=workspace,
principal_id=handle, source="api-key",
)
@pytest.mark.asyncio
async def test_allow_returns_no_exception(self):
auth = IamAuth(backend=Mock())
async def fake_with_client(op):
return await op(Mock(
authorise=AsyncMock(return_value=(True, 30)),
))
with patch.object(auth, "_with_client", side_effect=fake_with_client):
await auth.authorise(
self._make_identity(),
"graph:read",
{"workspace": "default"},
{},
)
@pytest.mark.asyncio
async def test_deny_raises_403(self):
auth = IamAuth(backend=Mock())
async def fake_with_client(op):
return await op(Mock(
authorise=AsyncMock(return_value=(False, 30)),
))
with patch.object(auth, "_with_client", side_effect=fake_with_client):
with pytest.raises(web.HTTPForbidden):
await auth.authorise(
self._make_identity(),
"users:admin",
{},
{"workspace": "acme"},
)
@pytest.mark.asyncio
async def test_regime_error_fails_closed_as_401(self):
# If iam-svc errors, the gateway must NOT silently allow.
auth = IamAuth(backend=Mock())
async def fake_with_client(op):
raise RuntimeError("iam-svc down")
with patch.object(auth, "_with_client", side_effect=fake_with_client):
with pytest.raises(web.HTTPUnauthorized):
await auth.authorise(
self._make_identity(),
"graph:read",
{"workspace": "default"},
{},
)
@pytest.mark.asyncio
async def test_allow_decision_is_cached(self):
auth = IamAuth(backend=Mock())
calls = {"n": 0}
async def fake_with_client(op):
calls["n"] += 1
return await op(Mock(
authorise=AsyncMock(return_value=(True, 30)),
))
with patch.object(auth, "_with_client", side_effect=fake_with_client):
ident = self._make_identity()
for _ in range(5):
await auth.authorise(
ident, "graph:read", {"workspace": "default"}, {},
)
assert calls["n"] == 1
@pytest.mark.asyncio
async def test_deny_decision_is_cached(self):
auth = IamAuth(backend=Mock())
calls = {"n": 0}
async def fake_with_client(op):
calls["n"] += 1
return await op(Mock(
authorise=AsyncMock(return_value=(False, 30)),
))
with patch.object(auth, "_with_client", side_effect=fake_with_client):
ident = self._make_identity()
for _ in range(5):
with pytest.raises(web.HTTPForbidden):
await auth.authorise(
ident, "users:admin", {}, {"workspace": "acme"},
)
# Denies are cached too — repeated attempts don't re-hit IAM.
assert calls["n"] == 1
@pytest.mark.asyncio
async def test_different_resources_cached_separately(self):
auth = IamAuth(backend=Mock())
calls = {"n": 0}
async def fake_with_client(op):
calls["n"] += 1
return await op(Mock(
authorise=AsyncMock(return_value=(True, 30)),
))
with patch.object(auth, "_with_client", side_effect=fake_with_client):
ident = self._make_identity()
await auth.authorise(
ident, "graph:read", {"workspace": "a"}, {},
)
await auth.authorise(
ident, "graph:read", {"workspace": "b"}, {},
)
# Different resource → different cache key → two IAM calls.
assert calls["n"] == 2

View file

@ -0,0 +1,171 @@
"""
Tests for gateway/capabilities.py: the thin authorisation surface
under the IAM contract.
The gateway no longer holds policy state (roles, capability sets,
workspace scopes); those live in iam-svc. These tests cover only
what the gateway shim does itself: PUBLIC / AUTHENTICATED short-
circuiting, default-fill of workspace, and forwarding of capability
checks to ``auth.authorise``.
"""
import pytest
from aiohttp import web
from unittest.mock import AsyncMock, MagicMock
from trustgraph.gateway.capabilities import (
PUBLIC, AUTHENTICATED,
enforce, enforce_workspace,
access_denied, auth_failure,
)
# -- test fixtures ---------------------------------------------------------
class _Identity:
"""Stand-in for auth.Identity — under the IAM contract it has
just ``handle``, ``workspace``, ``principal_id``, ``source``."""
def __init__(self, handle="user-1", workspace="default"):
self.handle = handle
self.workspace = workspace
self.principal_id = handle
self.source = "api-key"
def _allow_auth(identity=None):
"""Build an Auth double that authenticates to ``identity`` and
allows every authorise() call."""
auth = MagicMock()
auth.authenticate = AsyncMock(
return_value=identity or _Identity(),
)
auth.authorise = AsyncMock(return_value=None)
return auth
def _deny_auth(identity=None):
"""Build an Auth double that authenticates but denies authorise."""
auth = MagicMock()
auth.authenticate = AsyncMock(
return_value=identity or _Identity(),
)
auth.authorise = AsyncMock(side_effect=access_denied())
return auth
# -- enforce() -------------------------------------------------------------
class TestEnforce:
@pytest.mark.asyncio
async def test_public_returns_none_no_auth(self):
auth = _allow_auth()
result = await enforce(MagicMock(), auth, PUBLIC)
assert result is None
auth.authenticate.assert_not_called()
auth.authorise.assert_not_called()
@pytest.mark.asyncio
async def test_authenticated_skips_authorise(self):
identity = _Identity()
auth = _allow_auth(identity)
result = await enforce(MagicMock(), auth, AUTHENTICATED)
assert result is identity
auth.authenticate.assert_awaited_once()
auth.authorise.assert_not_called()
@pytest.mark.asyncio
async def test_capability_calls_authorise_system_level(self):
identity = _Identity()
auth = _allow_auth(identity)
result = await enforce(MagicMock(), auth, "graph:read")
assert result is identity
auth.authorise.assert_awaited_once_with(
identity, "graph:read", {}, {},
)
@pytest.mark.asyncio
async def test_capability_denied_raises_forbidden(self):
auth = _deny_auth()
with pytest.raises(web.HTTPForbidden):
await enforce(MagicMock(), auth, "users:admin")
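The four cases above fix enforce()'s control flow: PUBLIC touches neither auth call, AUTHENTICATED authenticates only, and a capability string additionally forwards a system-level authorise() with empty resource and context. A sketch of that flow (illustrative; the real enforce() may differ in detail):

async def enforce_sketch(request, auth, capability):
    if capability is PUBLIC:
        return None                      # no authentication at all
    identity = await auth.authenticate(request)
    if capability is AUTHENTICATED:
        return identity                  # authn only, no authorise() call
    await auth.authorise(identity, capability, {}, {})
    return identity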
# -- enforce_workspace() ---------------------------------------------------
class TestEnforceWorkspace:
@pytest.mark.asyncio
async def test_default_fills_from_identity(self):
data = {"operation": "x"}
auth = _allow_auth()
await enforce_workspace(data, _Identity(workspace="default"), auth)
assert data["workspace"] == "default"
@pytest.mark.asyncio
async def test_caller_supplied_workspace_kept(self):
data = {"workspace": "acme", "operation": "x"}
auth = _allow_auth()
await enforce_workspace(data, _Identity(workspace="default"), auth)
assert data["workspace"] == "acme"
@pytest.mark.asyncio
async def test_no_capability_skips_authorise(self):
data = {"workspace": "default"}
auth = _allow_auth()
await enforce_workspace(data, _Identity(), auth)
auth.authorise.assert_not_called()
@pytest.mark.asyncio
async def test_capability_calls_authorise_with_resource(self):
data = {"workspace": "acme"}
identity = _Identity()
auth = _allow_auth(identity)
await enforce_workspace(
data, identity, auth, capability="graph:read",
)
auth.authorise.assert_awaited_once_with(
identity, "graph:read", {"workspace": "acme"}, {},
)
@pytest.mark.asyncio
async def test_capability_denied_propagates(self):
data = {"workspace": "acme"}
auth = _deny_auth()
with pytest.raises(web.HTTPForbidden):
await enforce_workspace(
data, _Identity(), auth, capability="users:admin",
)
@pytest.mark.asyncio
async def test_non_dict_passthrough(self):
auth = _allow_auth()
result = await enforce_workspace("not-a-dict", _Identity(), auth)
assert result == "not-a-dict"
auth.authorise.assert_not_called()
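Similarly, the workspace cases above pin down enforce_workspace(): non-dict payloads pass through untouched, an absent workspace is filled from the identity, a caller-supplied one is kept, and a capability (when given) is authorised against a resource carrying that workspace. A sketch under those assumptions:

async def enforce_workspace_sketch(data, identity, auth, capability=None):
    if not isinstance(data, dict):
        return data                      # passthrough for non-dict payloads
    data.setdefault("workspace", identity.workspace)
    if capability is not None:
        await auth.authorise(
            identity, capability, {"workspace": data["workspace"]}, {},
        )
    return data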
# -- helpers ---------------------------------------------------------------
class TestResponseHelpers:
def test_auth_failure_is_401(self):
exc = auth_failure()
assert exc.status == 401
assert "auth failure" in exc.text
def test_access_denied_is_403(self):
exc = access_denied()
assert exc.status == 403
assert "access denied" in exc.text
class TestSentinels:
def test_public_and_authenticated_are_distinct(self):
assert PUBLIC != AUTHENTICATED

View file

@ -42,7 +42,7 @@ class TestDispatcherManager:
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
assert manager.backend == mock_backend
assert manager.config_receiver == mock_config_receiver
@ -59,7 +59,10 @@ class TestDispatcherManager:
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver, prefix="custom-prefix")
manager = DispatcherManager(
mock_backend, mock_config_receiver,
auth=Mock(), prefix="custom-prefix",
)
assert manager.prefix == "custom-prefix"
@ -68,7 +71,7 @@ class TestDispatcherManager:
"""Test start_flow method"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
flow_data = {"name": "test_flow", "steps": []}
@ -82,7 +85,7 @@ class TestDispatcherManager:
"""Test stop_flow method"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Pre-populate with a flow
flow_data = {"name": "test_flow", "steps": []}
@ -96,7 +99,7 @@ class TestDispatcherManager:
"""Test dispatch_global_service returns DispatcherWrapper"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
wrapper = manager.dispatch_global_service()
@ -107,7 +110,7 @@ class TestDispatcherManager:
"""Test dispatch_core_export returns DispatcherWrapper"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
wrapper = manager.dispatch_core_export()
@ -118,7 +121,7 @@ class TestDispatcherManager:
"""Test dispatch_core_import returns DispatcherWrapper"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
wrapper = manager.dispatch_core_import()
@ -130,7 +133,7 @@ class TestDispatcherManager:
"""Test process_core_import method"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
with patch('trustgraph.gateway.dispatch.manager.CoreImport') as mock_core_import:
mock_importer = Mock()
@ -148,7 +151,7 @@ class TestDispatcherManager:
"""Test process_core_export method"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
with patch('trustgraph.gateway.dispatch.manager.CoreExport') as mock_core_export:
mock_exporter = Mock()
@ -166,7 +169,7 @@ class TestDispatcherManager:
"""Test process_global_service method"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
manager.invoke_global_service = AsyncMock(return_value="global_result")
@ -181,7 +184,7 @@ class TestDispatcherManager:
"""Test invoke_global_service with existing dispatcher"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Pre-populate with existing dispatcher
mock_dispatcher = Mock()
@ -198,7 +201,7 @@ class TestDispatcherManager:
"""Test invoke_global_service creates new dispatcher"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
with patch('trustgraph.gateway.dispatch.manager.global_dispatchers') as mock_dispatchers:
mock_dispatcher_class = Mock()
@ -230,7 +233,7 @@ class TestDispatcherManager:
"""Test dispatch_flow_import returns correct method"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
result = manager.dispatch_flow_import()
@ -240,7 +243,7 @@ class TestDispatcherManager:
"""Test dispatch_flow_export returns correct method"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
result = manager.dispatch_flow_export()
@ -250,7 +253,7 @@ class TestDispatcherManager:
"""Test dispatch_socket returns correct method"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
result = manager.dispatch_socket()
@ -260,7 +263,7 @@ class TestDispatcherManager:
"""Test dispatch_flow_service returns DispatcherWrapper"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
wrapper = manager.dispatch_flow_service()
@ -272,7 +275,7 @@ class TestDispatcherManager:
"""Test process_flow_import with valid flow and kind"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow
manager.flows[("default", "test_flow")] = {
@ -308,7 +311,7 @@ class TestDispatcherManager:
"""Test process_flow_import with invalid flow"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
params = {"flow": "invalid_flow", "kind": "triples"}
@ -323,7 +326,7 @@ class TestDispatcherManager:
warnings.simplefilter("ignore", RuntimeWarning)
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow
manager.flows[("default", "test_flow")] = {
@ -345,7 +348,7 @@ class TestDispatcherManager:
"""Test process_flow_export with valid flow and kind"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow
manager.flows[("default", "test_flow")] = {
@ -378,26 +381,47 @@ class TestDispatcherManager:
@pytest.mark.asyncio
async def test_process_socket(self):
"""Test process_socket method"""
"""process_socket constructs a Mux with the manager's auth
instance passed through; this is the gateway's trust path
for first-frame WebSocket authentication. A Mux cannot be
built without auth (tested separately); this test pins that
the dispatcher-manager threads the correct auth value into
the Mux constructor call."""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
mock_auth = Mock()
manager = DispatcherManager(
mock_backend, mock_config_receiver, auth=mock_auth,
)
with patch('trustgraph.gateway.dispatch.manager.Mux') as mock_mux:
mock_mux_instance = Mock()
mock_mux.return_value = mock_mux_instance
result = await manager.process_socket("ws", "running", {})
mock_mux.assert_called_once_with(manager, "ws", "running")
mock_mux.assert_called_once_with(
manager, "ws", "running", auth=mock_auth,
)
assert result == mock_mux_instance
def test_dispatcher_manager_requires_auth(self):
"""Constructing a DispatcherManager without an auth argument
must fail; a no-auth DispatcherManager would produce a
Mux without authentication, silently downgrading the socket
auth path."""
mock_backend = Mock()
mock_config_receiver = Mock()
with pytest.raises(ValueError, match="auth"):
DispatcherManager(mock_backend, mock_config_receiver, auth=None)
@pytest.mark.asyncio
async def test_process_flow_service(self):
"""Test process_flow_service method"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
manager.invoke_flow_service = AsyncMock(return_value="flow_result")
@ -412,7 +436,7 @@ class TestDispatcherManager:
"""Test invoke_flow_service with existing dispatcher"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Add flow to the flows dictionary
manager.flows[("default", "test_flow")] = {"services": {"agent": {}}}
@ -432,7 +456,7 @@ class TestDispatcherManager:
"""Test invoke_flow_service creates request-response dispatcher"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow
manager.flows[("default", "test_flow")] = {
@ -476,7 +500,7 @@ class TestDispatcherManager:
"""Test invoke_flow_service creates sender dispatcher"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow
manager.flows[("default", "test_flow")] = {
@ -516,7 +540,7 @@ class TestDispatcherManager:
"""Test invoke_flow_service with invalid flow"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
with pytest.raises(RuntimeError, match="Invalid flow"):
await manager.invoke_flow_service("data", "responder", "default", "invalid_flow", "agent")
@ -526,7 +550,7 @@ class TestDispatcherManager:
"""Test invoke_flow_service with kind not supported by flow"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow without agent interface
manager.flows[("default", "test_flow")] = {
@ -543,7 +567,7 @@ class TestDispatcherManager:
"""Test invoke_flow_service with invalid kind"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow with interface but unsupported kind
manager.flows[("default", "test_flow")] = {
@ -570,7 +594,7 @@ class TestDispatcherManager:
"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
async def slow_start():
# Yield to the event loop so other coroutines get a chance to run,
@ -606,7 +630,7 @@ class TestDispatcherManager:
"""
mock_backend = Mock()
mock_config_receiver = Mock()
manager = DispatcherManager(mock_backend, mock_config_receiver)
manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
manager.flows[("default", "test_flow")] = {
"interfaces": {

View file

@ -12,6 +12,19 @@ from trustgraph.gateway.dispatch.mux import Mux, MAX_QUEUE_SIZE
class TestMux:
"""Test cases for Mux class"""
def test_mux_requires_auth(self):
"""Constructing a Mux without an ``auth`` argument must
fail. The Mux implements the first-frame auth protocol and
there is no no-auth mode; a no-auth Mux would silently
accept every frame without authenticating it."""
with pytest.raises(ValueError, match="auth"):
Mux(
dispatcher_manager=MagicMock(),
ws=MagicMock(),
running=MagicMock(),
auth=None,
)
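The "no no-auth mode" constraint exists because the Mux, not the HTTP layer, performs authentication for the socket route. A heavily hedged sketch of what handling that first frame could look like, assuming the first frame carries an auth token and that the auth object exposes some async verification call (both assumptions; the real Mux's field names and flow are not shown by these tests):

import json

async def first_frame_auth_sketch(ws, auth):
    msg = json.loads(await ws.receive_str())       # aiohttp text frame
    if msg.get("type") != "auth":
        await ws.close()
        raise RuntimeError("expected auth as the first frame")
    return await auth.verify_token(msg.get("token"))   # hypothetical method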
def test_mux_initialization(self):
"""Test Mux initialization"""
mock_dispatcher_manager = MagicMock()
@ -21,7 +34,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=mock_ws,
running=mock_running
running=mock_running,
auth=MagicMock(),
)
assert mux.dispatcher_manager == mock_dispatcher_manager
@ -40,7 +54,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=mock_ws,
running=mock_running
running=mock_running,
auth=MagicMock(),
)
# Call destroy
@ -61,7 +76,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=None,
running=mock_running
running=mock_running,
auth=MagicMock(),
)
# Call destroy
@ -81,7 +97,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=mock_ws,
running=mock_running
running=mock_running,
auth=MagicMock(),
)
# Mock message with valid JSON
@ -108,7 +125,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=mock_ws,
running=mock_running
running=mock_running,
auth=MagicMock(),
)
# Mock message without request field
@ -137,7 +155,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=mock_ws,
running=mock_running
running=mock_running,
auth=MagicMock(),
)
# Mock message without id field
@ -164,7 +183,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=mock_ws,
running=mock_running
running=mock_running,
auth=MagicMock(),
)
# Mock message with invalid JSON

View file

@ -13,29 +13,36 @@ class TestConstantEndpoint:
"""Test cases for ConstantEndpoint class"""
def test_constant_endpoint_initialization(self):
"""Test ConstantEndpoint initialization"""
"""Construction records the configured capability on the
instance. The capability is a required argument (no
permissive default), and the test passes an explicit
value to demonstrate the contract."""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
endpoint = ConstantEndpoint(
endpoint_path="/api/test",
auth=mock_auth,
dispatcher=mock_dispatcher
dispatcher=mock_dispatcher,
capability="config:read",
)
assert endpoint.path == "/api/test"
assert endpoint.auth == mock_auth
assert endpoint.dispatcher == mock_dispatcher
assert endpoint.operation == "service"
assert endpoint.capability == "config:read"
@pytest.mark.asyncio
async def test_constant_endpoint_start_method(self):
"""Test ConstantEndpoint start method (should be no-op)"""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
endpoint = ConstantEndpoint("/api/test", mock_auth, mock_dispatcher)
endpoint = ConstantEndpoint(
"/api/test", mock_auth, mock_dispatcher,
capability="config:read",
)
# start() should complete without error
await endpoint.start()
@ -44,10 +51,13 @@ class TestConstantEndpoint:
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
mock_app = MagicMock()
endpoint = ConstantEndpoint("/api/test", mock_auth, mock_dispatcher)
endpoint = ConstantEndpoint(
"/api/test", mock_auth, mock_dispatcher,
capability="config:read",
)
endpoint.add_routes(mock_app)
# Verify add_routes was called with POST route
mock_app.add_routes.assert_called_once()
# The call should include web.post with the path and handler

View file

@ -1,4 +1,12 @@
"""Tests for Gateway i18n pack endpoint."""
"""Tests for Gateway i18n pack endpoint.
Production registers this endpoint with ``capability=PUBLIC``: the
login UI needs to render its own i18n strings before any user has
authenticated, so the endpoint is deliberately pre-auth. These
tests exercise the PUBLIC configuration, which is the production
contract. Behaviour of authenticated endpoints is covered by the
IamAuth tests in ``test_auth.py``.
"""
import json
from unittest.mock import MagicMock
@ -7,6 +15,7 @@ import pytest
from aiohttp import web
from trustgraph.gateway.endpoint.i18n import I18nPackEndpoint
from trustgraph.gateway.capabilities import PUBLIC
class TestI18nPackEndpoint:
@ -17,23 +26,28 @@ class TestI18nPackEndpoint:
endpoint = I18nPackEndpoint(
endpoint_path="/api/v1/i18n/packs/{lang}",
auth=mock_auth,
capability=PUBLIC,
)
assert endpoint.path == "/api/v1/i18n/packs/{lang}"
assert endpoint.auth == mock_auth
assert endpoint.operation == "service"
assert endpoint.capability == PUBLIC
@pytest.mark.asyncio
async def test_i18n_endpoint_start_method(self):
mock_auth = MagicMock()
endpoint = I18nPackEndpoint("/api/v1/i18n/packs/{lang}", mock_auth)
endpoint = I18nPackEndpoint(
"/api/v1/i18n/packs/{lang}", mock_auth, capability=PUBLIC,
)
await endpoint.start()
def test_add_routes_registers_get_handler(self):
mock_auth = MagicMock()
mock_app = MagicMock()
endpoint = I18nPackEndpoint("/api/v1/i18n/packs/{lang}", mock_auth)
endpoint = I18nPackEndpoint(
"/api/v1/i18n/packs/{lang}", mock_auth, capability=PUBLIC,
)
endpoint.add_routes(mock_app)
mock_app.add_routes.assert_called_once()
@ -41,35 +55,55 @@ class TestI18nPackEndpoint:
assert len(call_args) == 1
@pytest.mark.asyncio
async def test_handle_unauthorized_on_invalid_auth_scheme(self):
async def test_handle_returns_pack_without_authenticating(self):
"""The PUBLIC endpoint serves the language pack without
invoking the auth handler at all; the pre-login UI must be
reachable. The test uses an auth mock that raises if
touched, so any auth attempt by the endpoint is caught."""
mock_auth = MagicMock()
mock_auth.permitted.return_value = True
endpoint = I18nPackEndpoint("/api/v1/i18n/packs/{lang}", mock_auth)
def _should_not_be_called(*args, **kwargs):
raise AssertionError(
"PUBLIC endpoint must not invoke auth.authenticate"
)
mock_auth.authenticate = _should_not_be_called
endpoint = I18nPackEndpoint(
"/api/v1/i18n/packs/{lang}", mock_auth, capability=PUBLIC,
)
request = MagicMock()
request.path = "/api/v1/i18n/packs/en"
# A caller-supplied Authorization header of any form should
# be ignored — PUBLIC means we don't look at it.
request.headers = {"Authorization": "Token abc"}
request.match_info = {"lang": "en"}
resp = await endpoint.handle(request)
assert isinstance(resp, web.HTTPUnauthorized)
@pytest.mark.asyncio
async def test_handle_returns_pack_when_permitted(self):
mock_auth = MagicMock()
mock_auth.permitted.return_value = True
endpoint = I18nPackEndpoint("/api/v1/i18n/packs/{lang}", mock_auth)
request = MagicMock()
request.path = "/api/v1/i18n/packs/en"
request.headers = {}
request.match_info = {"lang": "en"}
resp = await endpoint.handle(request)
assert resp.status == 200
payload = json.loads(resp.body.decode("utf-8"))
assert isinstance(payload, dict)
assert "cli.verify_system_status.title" in payload
@pytest.mark.asyncio
async def test_handle_rejects_path_traversal(self):
"""The ``lang`` path parameter is reflected through to the
filesystem-backed pack loader. The endpoint contains an
explicit defense against ``/`` and ``..`` in the value; this
test pins that defense in place."""
mock_auth = MagicMock()
endpoint = I18nPackEndpoint(
"/api/v1/i18n/packs/{lang}", mock_auth, capability=PUBLIC,
)
for bad in ("../../etc/passwd", "en/../fr", "a/b"):
request = MagicMock()
request.path = f"/api/v1/i18n/packs/{bad}"
request.headers = {}
request.match_info = {"lang": bad}
resp = await endpoint.handle(request)
assert isinstance(resp, web.HTTPBadRequest), (
f"path-traversal defense did not reject lang={bad!r}"
)
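The defence being pinned is deliberately small: reject any ``lang`` value containing a path separator or a parent-directory component before it ever reaches the filesystem-backed loader. A sketch of a check satisfying the cases above (illustrative; the endpoint's actual predicate may differ):

def lang_is_safe(lang):
    return bool(lang) and "/" not in lang and ".." not in lang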

View file

@ -12,30 +12,24 @@ class TestEndpointManager:
"""Test cases for EndpointManager class"""
def test_endpoint_manager_initialization(self):
"""Test EndpointManager initialization creates all endpoints"""
"""EndpointManager wires up the full endpoint set and
records dispatcher_manager / timeout on the instance."""
mock_dispatcher_manager = MagicMock()
mock_auth = MagicMock()
# Mock dispatcher methods
mock_dispatcher_manager.dispatch_global_service.return_value = MagicMock()
mock_dispatcher_manager.dispatch_socket.return_value = MagicMock()
mock_dispatcher_manager.dispatch_flow_service.return_value = MagicMock()
mock_dispatcher_manager.dispatch_flow_import.return_value = MagicMock()
mock_dispatcher_manager.dispatch_flow_export.return_value = MagicMock()
mock_dispatcher_manager.dispatch_core_import.return_value = MagicMock()
mock_dispatcher_manager.dispatch_core_export.return_value = MagicMock()
# The dispatcher_manager exposes a small set of factory
# methods — MagicMock auto-creates them, returning fresh
# MagicMocks on each call.
manager = EndpointManager(
dispatcher_manager=mock_dispatcher_manager,
auth=mock_auth,
prometheus_url="http://prometheus:9090",
timeout=300
timeout=300,
)
assert manager.dispatcher_manager == mock_dispatcher_manager
assert manager.timeout == 300
assert manager.services == {}
assert len(manager.endpoints) > 0 # Should have multiple endpoints
assert len(manager.endpoints) > 0
def test_endpoint_manager_with_default_timeout(self):
"""Test EndpointManager with default timeout value"""
@ -79,9 +73,17 @@ class TestEndpointManager:
prometheus_url="http://test:9090"
)
# Verify all dispatcher methods were called during initialization
# Each dispatcher factory is invoked once per endpoint that
# needs a dedicated wire. dispatch_auth_iam is shared by
# two endpoints — AuthEndpoints (login / bootstrap /
# change-password) and IamEndpoint (registry-driven
# /api/v1/iam) — so it's expected to be called twice.
# Both forwarders pin the dispatcher to kind=iam and reuse
# the same factory; they're distinct from
# dispatch_global_service (the generic /api/v1/{kind} route).
mock_dispatcher_manager.dispatch_global_service.assert_called_once()
mock_dispatcher_manager.dispatch_socket.assert_called() # Called twice
assert mock_dispatcher_manager.dispatch_auth_iam.call_count == 2
mock_dispatcher_manager.dispatch_socket.assert_called_once()
mock_dispatcher_manager.dispatch_flow_service.assert_called_once()
mock_dispatcher_manager.dispatch_flow_import.assert_called_once()
mock_dispatcher_manager.dispatch_flow_export.assert_called_once()

View file

@ -12,31 +12,35 @@ class TestMetricsEndpoint:
"""Test cases for MetricsEndpoint class"""
def test_metrics_endpoint_initialization(self):
"""Test MetricsEndpoint initialization"""
"""Construction records the configured capability on the
instance. In production MetricsEndpoint is gated by
'metrics:read' so that's the natural value to pass."""
mock_auth = MagicMock()
endpoint = MetricsEndpoint(
prometheus_url="http://prometheus:9090",
endpoint_path="/metrics",
auth=mock_auth
auth=mock_auth,
capability="metrics:read",
)
assert endpoint.prometheus_url == "http://prometheus:9090"
assert endpoint.path == "/metrics"
assert endpoint.auth == mock_auth
assert endpoint.operation == "service"
assert endpoint.capability == "metrics:read"
@pytest.mark.asyncio
async def test_metrics_endpoint_start_method(self):
"""Test MetricsEndpoint start method (should be no-op)"""
mock_auth = MagicMock()
endpoint = MetricsEndpoint(
prometheus_url="http://localhost:9090",
endpoint_path="/metrics",
auth=mock_auth
auth=mock_auth,
capability="metrics:read",
)
# start() should complete without error
await endpoint.start()
@ -44,15 +48,16 @@ class TestMetricsEndpoint:
"""Test add_routes method registers GET route with wildcard path"""
mock_auth = MagicMock()
mock_app = MagicMock()
endpoint = MetricsEndpoint(
prometheus_url="http://prometheus:9090",
endpoint_path="/metrics",
auth=mock_auth
auth=mock_auth,
capability="metrics:read",
)
endpoint.add_routes(mock_app)
# Verify add_routes was called with GET route
mock_app.add_routes.assert_called_once()
# The call should include web.get with wildcard path pattern

View file

@ -1,5 +1,12 @@
"""
Tests for Gateway Socket Endpoint
Tests for Gateway Socket Endpoint.
In production the only SocketEndpoint registered with HTTP-layer
auth is ``/api/v1/socket`` using ``capability=AUTHENTICATED`` with
``in_band_auth=True`` (first-frame auth over the websocket frames,
not at the handshake). The tests below use AUTHENTICATED as the
representative capability; construction / worker / listener
behaviour is independent of which capability is configured.
"""
import pytest
@ -7,41 +14,47 @@ from unittest.mock import MagicMock, AsyncMock
from aiohttp import WSMsgType
from trustgraph.gateway.endpoint.socket import SocketEndpoint
from trustgraph.gateway.capabilities import AUTHENTICATED
class TestSocketEndpoint:
"""Test cases for SocketEndpoint class"""
def test_socket_endpoint_initialization(self):
"""Test SocketEndpoint initialization"""
"""Construction records the configured capability on the
instance. No permissive default is applied."""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
endpoint = SocketEndpoint(
endpoint_path="/api/socket",
auth=mock_auth,
dispatcher=mock_dispatcher
dispatcher=mock_dispatcher,
capability=AUTHENTICATED,
)
assert endpoint.path == "/api/socket"
assert endpoint.auth == mock_auth
assert endpoint.dispatcher == mock_dispatcher
assert endpoint.operation == "socket"
assert endpoint.capability == AUTHENTICATED
@pytest.mark.asyncio
async def test_worker_method(self):
"""Test SocketEndpoint worker method"""
mock_auth = MagicMock()
mock_dispatcher = AsyncMock()
endpoint = SocketEndpoint("/api/socket", mock_auth, mock_dispatcher)
endpoint = SocketEndpoint(
"/api/socket", mock_auth, mock_dispatcher,
capability=AUTHENTICATED,
)
mock_ws = MagicMock()
mock_running = MagicMock()
# Call worker method
await endpoint.worker(mock_ws, mock_dispatcher, mock_running)
# Verify dispatcher.run was called
mock_dispatcher.run.assert_called_once()
@ -50,8 +63,11 @@ class TestSocketEndpoint:
"""Test SocketEndpoint listener method with text message"""
mock_auth = MagicMock()
mock_dispatcher = AsyncMock()
endpoint = SocketEndpoint("/api/socket", mock_auth, mock_dispatcher)
endpoint = SocketEndpoint(
"/api/socket", mock_auth, mock_dispatcher,
capability=AUTHENTICATED,
)
# Mock websocket with text message
mock_msg = MagicMock()
@ -80,8 +96,11 @@ class TestSocketEndpoint:
"""Test SocketEndpoint listener method with binary message"""
mock_auth = MagicMock()
mock_dispatcher = AsyncMock()
endpoint = SocketEndpoint("/api/socket", mock_auth, mock_dispatcher)
endpoint = SocketEndpoint(
"/api/socket", mock_auth, mock_dispatcher,
capability=AUTHENTICATED,
)
# Mock websocket with binary message
mock_msg = MagicMock()
@ -110,8 +129,11 @@ class TestSocketEndpoint:
"""Test SocketEndpoint listener method with close message"""
mock_auth = MagicMock()
mock_dispatcher = AsyncMock()
endpoint = SocketEndpoint("/api/socket", mock_auth, mock_dispatcher)
endpoint = SocketEndpoint(
"/api/socket", mock_auth, mock_dispatcher,
capability=AUTHENTICATED,
)
# Mock websocket with close message
mock_msg = MagicMock()

View file

@ -12,48 +12,57 @@ class TestStreamEndpoint:
"""Test cases for StreamEndpoint class"""
def test_stream_endpoint_initialization_with_post(self):
"""Test StreamEndpoint initialization with POST method"""
"""Construction records the configured capability on the
instance. StreamEndpoint is used in production for the
core-import / core-export / document-stream routes; a
document-write capability is a realistic value for a POST
stream (e.g. core-import)."""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
endpoint = StreamEndpoint(
endpoint_path="/api/stream",
auth=mock_auth,
dispatcher=mock_dispatcher,
method="POST"
capability="documents:write",
method="POST",
)
assert endpoint.path == "/api/stream"
assert endpoint.auth == mock_auth
assert endpoint.dispatcher == mock_dispatcher
assert endpoint.operation == "service"
assert endpoint.capability == "documents:write"
assert endpoint.method == "POST"
def test_stream_endpoint_initialization_with_get(self):
"""Test StreamEndpoint initialization with GET method"""
"""GET stream — export-style endpoint, read capability."""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
endpoint = StreamEndpoint(
endpoint_path="/api/stream",
auth=mock_auth,
dispatcher=mock_dispatcher,
method="GET"
capability="documents:read",
method="GET",
)
assert endpoint.method == "GET"
def test_stream_endpoint_initialization_default_method(self):
"""Test StreamEndpoint initialization with default POST method"""
"""Test StreamEndpoint initialization with default POST method.
The method default is cosmetic; the capability is not
defaulted; it is always required."""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
endpoint = StreamEndpoint(
endpoint_path="/api/stream",
auth=mock_auth,
dispatcher=mock_dispatcher
dispatcher=mock_dispatcher,
capability="documents:write",
)
assert endpoint.method == "POST" # Default value
@pytest.mark.asyncio
@ -61,9 +70,12 @@ class TestStreamEndpoint:
"""Test StreamEndpoint start method (should be no-op)"""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
endpoint = StreamEndpoint("/api/stream", mock_auth, mock_dispatcher)
endpoint = StreamEndpoint(
"/api/stream", mock_auth, mock_dispatcher,
capability="documents:write",
)
# start() should complete without error
await endpoint.start()
@ -72,16 +84,17 @@ class TestStreamEndpoint:
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
mock_app = MagicMock()
endpoint = StreamEndpoint(
endpoint_path="/api/stream",
auth=mock_auth,
dispatcher=mock_dispatcher,
method="POST"
capability="documents:write",
method="POST",
)
endpoint.add_routes(mock_app)
# Verify add_routes was called with POST route
mock_app.add_routes.assert_called_once()
call_args = mock_app.add_routes.call_args[0][0]
@ -92,16 +105,17 @@ class TestStreamEndpoint:
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
mock_app = MagicMock()
endpoint = StreamEndpoint(
endpoint_path="/api/stream",
auth=mock_auth,
dispatcher=mock_dispatcher,
method="GET"
capability="documents:read",
method="GET",
)
endpoint.add_routes(mock_app)
# Verify add_routes was called with GET route
mock_app.add_routes.assert_called_once()
call_args = mock_app.add_routes.call_args[0][0]
@ -112,13 +126,14 @@ class TestStreamEndpoint:
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
mock_app = MagicMock()
endpoint = StreamEndpoint(
endpoint_path="/api/stream",
auth=mock_auth,
dispatcher=mock_dispatcher,
method="INVALID"
capability="documents:write",
method="INVALID",
)
with pytest.raises(RuntimeError, match="Bad method"):
endpoint.add_routes(mock_app)

View file

@ -12,29 +12,36 @@ class TestVariableEndpoint:
"""Test cases for VariableEndpoint class"""
def test_variable_endpoint_initialization(self):
"""Test VariableEndpoint initialization"""
"""Construction records the configured capability on the
instance. VariableEndpoint is used in production for the
/api/v1/{kind} admin-scoped global service routes, so a
write-side capability is a realistic value for the test."""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
endpoint = VariableEndpoint(
endpoint_path="/api/variable",
auth=mock_auth,
dispatcher=mock_dispatcher
dispatcher=mock_dispatcher,
capability="config:write",
)
assert endpoint.path == "/api/variable"
assert endpoint.auth == mock_auth
assert endpoint.dispatcher == mock_dispatcher
assert endpoint.operation == "service"
assert endpoint.capability == "config:write"
@pytest.mark.asyncio
async def test_variable_endpoint_start_method(self):
"""Test VariableEndpoint start method (should be no-op)"""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
endpoint = VariableEndpoint("/api/var", mock_auth, mock_dispatcher)
endpoint = VariableEndpoint(
"/api/var", mock_auth, mock_dispatcher,
capability="config:write",
)
# start() should complete without error
await endpoint.start()
@ -43,10 +50,13 @@ class TestVariableEndpoint:
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
mock_app = MagicMock()
endpoint = VariableEndpoint("/api/variable", mock_auth, mock_dispatcher)
endpoint = VariableEndpoint(
"/api/variable", mock_auth, mock_dispatcher,
capability="config:write",
)
endpoint.add_routes(mock_app)
# Verify add_routes was called with POST route
mock_app.add_routes.assert_called_once()
call_args = mock_app.add_routes.call_args[0][0]

View file

@ -1,355 +1,179 @@
"""
Tests for Gateway Service API
Tests for gateway/service.py: the Api class that wires together
the pub/sub backend, IAM auth, config receiver, dispatcher manager,
and endpoint manager.
The legacy ``GATEWAY_SECRET`` / ``default_api_token`` / allow-all
surface is gone, so the tests here focus on the Api's construction
and composition rather than the removed auth behaviour. IamAuth's
own behaviour is covered in test_auth.py.
"""
import pytest
import asyncio
from unittest.mock import Mock, patch, MagicMock, AsyncMock
from unittest.mock import AsyncMock, Mock, patch
from aiohttp import web
import pulsar
from trustgraph.gateway.service import Api, run, default_pulsar_host, default_prometheus_url, default_timeout, default_port, default_api_token
# Tests for Gateway Service API
from trustgraph.gateway.service import (
Api,
default_pulsar_host, default_prometheus_url,
default_timeout, default_port,
)
from trustgraph.gateway.auth import IamAuth
class TestApi:
"""Test cases for Api class"""
# -- constants -------------------------------------------------------------
def test_api_initialization_with_defaults(self):
"""Test Api initialization with default values"""
with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
mock_backend = Mock()
mock_get_pubsub.return_value = mock_backend
api = Api()
class TestDefaults:
assert api.port == default_port
assert api.timeout == default_timeout
assert api.pulsar_host == default_pulsar_host
assert api.pulsar_api_key is None
assert api.prometheus_url == default_prometheus_url + "/"
assert api.auth.allow_all is True
def test_exports_default_constants(self):
# These are consumed by CLIs / tests / docs. Sanity-check
# that they're the expected shape.
assert default_port == 8088
assert default_timeout == 600
assert default_pulsar_host.startswith("pulsar://")
assert default_prometheus_url.startswith("http")
# Verify get_pubsub was called
mock_get_pubsub.assert_called_once()
def test_api_initialization_with_custom_config(self):
"""Test Api initialization with custom configuration"""
# -- Api construction ------------------------------------------------------
@pytest.fixture
def mock_backend():
return Mock()
@pytest.fixture
def api(mock_backend):
with patch(
"trustgraph.gateway.service.get_pubsub",
return_value=mock_backend,
):
yield Api()
class TestApiConstruction:
def test_defaults(self, api):
assert api.port == default_port
assert api.timeout == default_timeout
assert api.pulsar_host == default_pulsar_host
assert api.pulsar_api_key is None
# prometheus_url gets normalised with a trailing slash
assert api.prometheus_url == default_prometheus_url + "/"
def test_auth_is_iam_backed(self, api):
# Any Api always gets an IamAuth. There is no "no auth" mode
# (GATEWAY_SECRET / allow_all has been removed — see IAM spec).
assert isinstance(api.auth, IamAuth)
def test_components_wired(self, api):
assert api.config_receiver is not None
assert api.dispatcher_manager is not None
assert api.endpoint_manager is not None
def test_dispatcher_manager_has_auth(self, api):
# The Mux uses this handle for first-frame socket auth.
assert api.dispatcher_manager.auth is api.auth
def test_custom_config(self, mock_backend):
config = {
"port": 9000,
"timeout": 300,
"pulsar_host": "pulsar://custom-host:6650",
"pulsar_api_key": "test-api-key",
"pulsar_listener": "custom-listener",
"pulsar_api_key": "custom-key",
"prometheus_url": "http://custom-prometheus:9090",
"api_token": "secret-token"
}
with patch(
"trustgraph.gateway.service.get_pubsub",
return_value=mock_backend,
):
a = Api(**config)
with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
mock_backend = Mock()
mock_get_pubsub.return_value = mock_backend
assert a.port == 9000
assert a.timeout == 300
assert a.pulsar_host == "pulsar://custom-host:6650"
assert a.pulsar_api_key == "custom-key"
# Trailing slash added.
assert a.prometheus_url == "http://custom-prometheus:9090/"
api = Api(**config)
def test_prometheus_url_already_has_trailing_slash(self, mock_backend):
with patch(
"trustgraph.gateway.service.get_pubsub",
return_value=mock_backend,
):
a = Api(prometheus_url="http://p:9090/")
assert a.prometheus_url == "http://p:9090/"
assert api.port == 9000
assert api.timeout == 300
assert api.pulsar_host == "pulsar://custom-host:6650"
assert api.pulsar_api_key == "test-api-key"
assert api.prometheus_url == "http://custom-prometheus:9090/"
assert api.auth.token == "secret-token"
assert api.auth.allow_all is False
def test_queue_overrides_parsed_for_config(self, mock_backend):
with patch(
"trustgraph.gateway.service.get_pubsub",
return_value=mock_backend,
):
a = Api(
config_request_queue="alt-config-req",
config_response_queue="alt-config-resp",
)
overrides = a.dispatcher_manager.queue_overrides
assert overrides.get("config", {}).get("request") == "alt-config-req"
assert overrides.get("config", {}).get("response") == "alt-config-resp"
# Verify get_pubsub was called with config
mock_get_pubsub.assert_called_once_with(**config)
def test_api_initialization_with_pulsar_api_key(self):
"""Test Api initialization with Pulsar API key authentication"""
with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
mock_get_pubsub.return_value = Mock()
# -- app_factory -----------------------------------------------------------
api = Api(pulsar_api_key="test-key")
# Verify api key was stored
assert api.pulsar_api_key == "test-key"
mock_get_pubsub.assert_called_once()
def test_api_initialization_prometheus_url_normalization(self):
"""Test that prometheus_url gets normalized with trailing slash"""
with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
mock_get_pubsub.return_value = Mock()
# Test URL without trailing slash
api = Api(prometheus_url="http://prometheus:9090")
assert api.prometheus_url == "http://prometheus:9090/"
# Test URL with trailing slash
api = Api(prometheus_url="http://prometheus:9090/")
assert api.prometheus_url == "http://prometheus:9090/"
def test_api_initialization_empty_api_token_means_no_auth(self):
"""Test that empty API token results in allow_all authentication"""
with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
mock_get_pubsub.return_value = Mock()
api = Api(api_token="")
assert api.auth.allow_all is True
def test_api_initialization_none_api_token_means_no_auth(self):
"""Test that None API token results in allow_all authentication"""
with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
mock_get_pubsub.return_value = Mock()
api = Api(api_token=None)
assert api.auth.allow_all is True
class TestAppFactory:
@pytest.mark.asyncio
async def test_app_factory_creates_application(self):
"""Test that app_factory creates aiohttp application"""
with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
mock_get_pubsub.return_value = Mock()
api = Api()
# Mock the dependencies
api.config_receiver = Mock()
api.config_receiver.start = AsyncMock()
api.endpoint_manager = Mock()
api.endpoint_manager.add_routes = Mock()
api.endpoint_manager.start = AsyncMock()
app = await api.app_factory()
assert isinstance(app, web.Application)
assert app._client_max_size == 256 * 1024 * 1024
# Verify that config receiver was started
api.config_receiver.start.assert_called_once()
# Verify that endpoint manager was configured
api.endpoint_manager.add_routes.assert_called_once_with(app)
api.endpoint_manager.start.assert_called_once()
async def test_creates_aiohttp_app(self, api):
# Stub out the long-tail dependencies that reach out to IAM /
# pub/sub so we can exercise the factory in isolation.
api.auth.start = AsyncMock()
api.config_receiver = Mock()
api.config_receiver.start = AsyncMock()
api.endpoint_manager = Mock()
api.endpoint_manager.add_routes = Mock()
api.endpoint_manager.start = AsyncMock()
api.endpoints = []
app = await api.app_factory()
assert isinstance(app, web.Application)
assert app._client_max_size == 256 * 1024 * 1024
api.auth.start.assert_called_once()
api.config_receiver.start.assert_called_once()
api.endpoint_manager.add_routes.assert_called_once_with(app)
api.endpoint_manager.start.assert_called_once()
@pytest.mark.asyncio
async def test_app_factory_with_custom_endpoints(self):
"""Test app_factory with custom endpoints"""
with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
mock_get_pubsub.return_value = Mock()
api = Api()
# Mock custom endpoints
mock_endpoint1 = Mock()
mock_endpoint1.add_routes = Mock()
mock_endpoint1.start = AsyncMock()
mock_endpoint2 = Mock()
mock_endpoint2.add_routes = Mock()
mock_endpoint2.start = AsyncMock()
api.endpoints = [mock_endpoint1, mock_endpoint2]
# Mock the dependencies
api.config_receiver = Mock()
api.config_receiver.start = AsyncMock()
api.endpoint_manager = Mock()
api.endpoint_manager.add_routes = Mock()
api.endpoint_manager.start = AsyncMock()
app = await api.app_factory()
# Verify custom endpoints were configured
mock_endpoint1.add_routes.assert_called_once_with(app)
mock_endpoint1.start.assert_called_once()
mock_endpoint2.add_routes.assert_called_once_with(app)
mock_endpoint2.start.assert_called_once()
async def test_auth_start_runs_before_accepting_traffic(self, api):
"""``auth.start()`` fetches the IAM signing key, and must
complete (or time out) before the gateway begins accepting
requests. It's the first await in app_factory."""
order = []
# AsyncMock.side_effect expects a sync callable (its return
# value becomes the coroutine's return); a plain list.append
# avoids the "coroutine was never awaited" trap of an async
# side_effect.
api.auth.start = AsyncMock(
side_effect=lambda: order.append("auth"),
)
api.config_receiver = Mock()
api.config_receiver.start = AsyncMock(
side_effect=lambda: order.append("config"),
)
api.endpoint_manager = Mock()
api.endpoint_manager.add_routes = Mock()
api.endpoint_manager.start = AsyncMock(
side_effect=lambda: order.append("endpoints"),
)
api.endpoints = []
await api.app_factory()
# auth.start must be first (before config receiver, before
# any endpoint starts).
assert order[0] == "auth"
# All three must have run.
assert set(order) == {"auth", "config", "endpoints"}
def test_run_method_calls_web_run_app(self):
"""Test that run method calls web.run_app"""
with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub, \
patch('aiohttp.web.run_app') as mock_run_app:
mock_get_pubsub.return_value = Mock()
api = Api(port=8080)
# Api.run() passes self.app_factory() — a coroutine — to
# web.run_app, which would normally consume it inside its own
# event loop. Since we mock run_app, close the coroutine here
# so it doesn't leak as an "unawaited coroutine" RuntimeWarning.
def _consume_coro(coro, **kwargs):
coro.close()
mock_run_app.side_effect = _consume_coro
api.run()
# Verify run_app was called once with the correct port
mock_run_app.assert_called_once()
args, kwargs = mock_run_app.call_args
assert len(args) == 1 # Should have one positional arg (the coroutine)
assert kwargs == {'port': 8080} # Should have port keyword arg
def test_api_components_initialization(self):
"""Test that all API components are properly initialized"""
with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
mock_get_pubsub.return_value = Mock()
api = Api()
# Verify all components are initialized
assert api.config_receiver is not None
assert api.dispatcher_manager is not None
assert api.endpoint_manager is not None
assert api.endpoints == []
# Verify component relationships
assert api.dispatcher_manager.backend == api.pubsub_backend
assert api.dispatcher_manager.config_receiver == api.config_receiver
assert api.endpoint_manager.dispatcher_manager == api.dispatcher_manager
# EndpointManager doesn't store auth directly; it passes it to individual endpoints
class TestRunFunction:
"""Test cases for the run() function"""
def test_run_function_with_metrics_enabled(self):
"""Test run function with metrics enabled"""
import warnings
# Suppress the specific async warning with a broader pattern
warnings.filterwarnings("ignore", message=".*Api.app_factory.*was never awaited", category=RuntimeWarning)
with patch('argparse.ArgumentParser.parse_args') as mock_parse_args, \
patch('trustgraph.gateway.service.start_http_server') as mock_start_http_server:
# Mock command line arguments
mock_args = Mock()
mock_args.metrics = True
mock_args.metrics_port = 8000
mock_parse_args.return_value = mock_args
# Create a simple mock instance without any async methods
mock_api_instance = Mock()
mock_api_instance.run = Mock()
# Create a mock Api class without importing the real one
mock_api = Mock(return_value=mock_api_instance)
# Patch using context manager to avoid importing the real Api class
with patch('trustgraph.gateway.service.Api', mock_api):
# Mock vars() to return a dict
with patch('builtins.vars') as mock_vars:
mock_vars.return_value = {
'metrics': True,
'metrics_port': 8000,
'pulsar_host': default_pulsar_host,
'timeout': default_timeout
}
run()
# Verify metrics server was started
mock_start_http_server.assert_called_once_with(8000)
# Verify Api was created and run was called
mock_api.assert_called_once()
mock_api_instance.run.assert_called_once()
@patch('trustgraph.gateway.service.start_http_server')
@patch('argparse.ArgumentParser.parse_args')
def test_run_function_with_metrics_disabled(self, mock_parse_args, mock_start_http_server):
"""Test run function with metrics disabled"""
# Mock command line arguments
mock_args = Mock()
mock_args.metrics = False
mock_parse_args.return_value = mock_args
# Create a simple mock instance without any async methods
mock_api_instance = Mock()
mock_api_instance.run = Mock()
# Patch the Api class inside the test without using decorators
with patch('trustgraph.gateway.service.Api') as mock_api:
mock_api.return_value = mock_api_instance
# Mock vars() to return a dict
with patch('builtins.vars') as mock_vars:
mock_vars.return_value = {
'metrics': False,
'metrics_port': 8000,
'pulsar_host': default_pulsar_host,
'timeout': default_timeout
}
run()
# Verify metrics server was NOT started
mock_start_http_server.assert_not_called()
# Verify Api was created and run was called
mock_api.assert_called_once()
mock_api_instance.run.assert_called_once()
@patch('argparse.ArgumentParser.parse_args')
def test_run_function_argument_parsing(self, mock_parse_args):
"""Test that run function properly parses command line arguments"""
# Mock command line arguments
mock_args = Mock()
mock_args.metrics = False
mock_parse_args.return_value = mock_args
# Create a simple mock instance without any async methods
mock_api_instance = Mock()
mock_api_instance.run = Mock()
# Mock vars() to return a dict with all expected arguments
expected_args = {
'pulsar_host': 'pulsar://test:6650',
'pulsar_api_key': 'test-key',
'pulsar_listener': 'test-listener',
'prometheus_url': 'http://test-prometheus:9090',
'port': 9000,
'timeout': 300,
'api_token': 'secret',
'log_level': 'INFO',
'metrics': False,
'metrics_port': 8001
}
# Patch the Api class inside the test without using decorators
with patch('trustgraph.gateway.service.Api') as mock_api:
mock_api.return_value = mock_api_instance
with patch('builtins.vars') as mock_vars:
mock_vars.return_value = expected_args
run()
# Verify Api was created with the parsed arguments
mock_api.assert_called_once_with(**expected_args)
mock_api_instance.run.assert_called_once()
def test_run_function_creates_argument_parser(self):
"""Test that run function creates argument parser with correct arguments"""
with patch('argparse.ArgumentParser') as mock_parser_class:
mock_parser = Mock()
mock_parser_class.return_value = mock_parser
mock_parser.parse_args.return_value = Mock(metrics=False)
with patch('trustgraph.gateway.service.Api') as mock_api, \
patch('builtins.vars') as mock_vars:
mock_vars.return_value = {'metrics': False}
mock_api.return_value = Mock()
run()
# Verify ArgumentParser was created
mock_parser_class.assert_called_once()
# Verify add_argument was called for each expected argument
expected_arguments = [
'pulsar-host', 'pulsar-api-key', 'pulsar-listener',
'prometheus-url', 'port', 'timeout', 'api-token',
'log-level', 'metrics', 'metrics-port'
]
# Check that add_argument was called multiple times (once for each arg)
assert mock_parser.add_argument.call_count >= len(expected_arguments)

View file

@ -1,4 +1,15 @@
"""Unit tests for SocketEndpoint graceful shutdown functionality."""
"""Unit tests for SocketEndpoint graceful shutdown functionality.
These tests exercise SocketEndpoint in its handshake-auth
configuration (``in_band_auth=False``) the mode used in production
for the flow import/export streaming endpoints. The mux socket at
``/api/v1/socket`` uses ``in_band_auth=True`` instead, where the
handshake always accepts and authentication runs on the first
WebSocket frame; that path is covered by the Mux tests.
Every endpoint constructor here passes an explicit capability no
permissive default is relied upon.
"""
import pytest
import asyncio
@ -6,13 +17,32 @@ from unittest.mock import AsyncMock, MagicMock, patch
from aiohttp import web, WSMsgType
from trustgraph.gateway.endpoint.socket import SocketEndpoint
from trustgraph.gateway.running import Running
from trustgraph.gateway.auth import Identity
# Representative capability used across these tests — corresponds to
# the flow-import streaming endpoint pattern that uses this class.
TEST_CAP = "graph:write"
def _valid_identity():
return Identity(
handle="test-user",
workspace="default",
principal_id="test-user",
source="api-key",
)
@pytest.fixture
def mock_auth():
"""Mock authentication service."""
"""Mock IAM-backed authenticator. Successful by default —
``authenticate`` returns a valid identity and ``authorise``
allows everything. Tests that need the failure paths override
the relevant attribute locally."""
auth = MagicMock()
auth.authenticate = AsyncMock(return_value=_valid_identity())
auth.authorise = AsyncMock(return_value=None)
return auth
@ -25,7 +55,7 @@ def mock_dispatcher_factory():
dispatcher.receive = AsyncMock()
dispatcher.destroy = AsyncMock()
return dispatcher
return dispatcher_factory
@ -35,7 +65,8 @@ def socket_endpoint(mock_auth, mock_dispatcher_factory):
return SocketEndpoint(
endpoint_path="/test-socket",
auth=mock_auth,
dispatcher=mock_dispatcher_factory
dispatcher=mock_dispatcher_factory,
capability=TEST_CAP,
)
@ -61,7 +92,10 @@ def mock_request():
@pytest.mark.asyncio
async def test_listener_graceful_shutdown_on_close():
"""Test listener handles websocket close gracefully."""
socket_endpoint = SocketEndpoint("/test", MagicMock(), AsyncMock())
socket_endpoint = SocketEndpoint(
"/test", MagicMock(), AsyncMock(),
capability=TEST_CAP,
)
# Mock websocket that closes after one message
ws = AsyncMock()
@ -99,9 +133,10 @@ async def test_listener_graceful_shutdown_on_close():
@pytest.mark.asyncio
async def test_handle_normal_flow():
"""Test normal websocket handling flow."""
"""Valid bearer → handshake accepted, dispatcher created."""
mock_auth = MagicMock()
mock_auth.authenticate = AsyncMock(return_value=_valid_identity())
mock_auth.authorise = AsyncMock(return_value=None)
dispatcher_created = False
async def mock_dispatcher_factory(ws, running, match_info):
@ -111,7 +146,10 @@ async def test_handle_normal_flow():
dispatcher.destroy = AsyncMock()
return dispatcher
socket_endpoint = SocketEndpoint("/test", mock_auth, mock_dispatcher_factory)
socket_endpoint = SocketEndpoint(
"/test", mock_auth, mock_dispatcher_factory,
capability=TEST_CAP,
)
request = MagicMock()
request.query = {"token": "valid-token"}
@ -155,7 +193,8 @@ async def test_handle_normal_flow():
async def test_handle_exception_group_cleanup():
"""Test exception group triggers dispatcher cleanup."""
mock_auth = MagicMock()
mock_auth.authenticate = AsyncMock(return_value=_valid_identity())
mock_auth.authorise = AsyncMock(return_value=None)
mock_dispatcher = AsyncMock()
mock_dispatcher.destroy = AsyncMock()
@ -163,7 +202,10 @@ async def test_handle_exception_group_cleanup():
async def mock_dispatcher_factory(ws, running, match_info):
return mock_dispatcher
socket_endpoint = SocketEndpoint("/test", mock_auth, mock_dispatcher_factory)
socket_endpoint = SocketEndpoint(
"/test", mock_auth, mock_dispatcher_factory,
capability=TEST_CAP,
)
request = MagicMock()
request.query = {"token": "valid-token"}
@ -222,7 +264,8 @@ async def test_handle_exception_group_cleanup():
async def test_handle_dispatcher_cleanup_timeout():
"""Test dispatcher cleanup with timeout."""
mock_auth = MagicMock()
mock_auth.authenticate = AsyncMock(return_value=_valid_identity())
mock_auth.authorise = AsyncMock(return_value=None)
# Mock dispatcher that takes long to destroy
mock_dispatcher = AsyncMock()
@ -231,7 +274,10 @@ async def test_handle_dispatcher_cleanup_timeout():
async def mock_dispatcher_factory(ws, running, match_info):
return mock_dispatcher
socket_endpoint = SocketEndpoint("/test", mock_auth, mock_dispatcher_factory)
socket_endpoint = SocketEndpoint(
"/test", mock_auth, mock_dispatcher_factory,
capability=TEST_CAP,
)
request = MagicMock()
request.query = {"token": "valid-token"}
@ -285,49 +331,68 @@ async def test_handle_dispatcher_cleanup_timeout():
@pytest.mark.asyncio
async def test_handle_unauthorized_request():
"""Test handling of unauthorized requests."""
"""A bearer that the IAM layer rejects causes the handshake to
fail with 401. IamAuth surfaces an HTTPUnauthorized; the
endpoint propagates it. Note that the endpoint intentionally
does NOT distinguish 'bad token', 'expired', 'revoked', etc.;
that's the IAM error-masking policy."""
mock_auth = MagicMock()
mock_auth.authenticate = AsyncMock(side_effect=web.HTTPUnauthorized(
text='{"error":"auth failure"}',
content_type="application/json",
))
socket_endpoint = SocketEndpoint(
"/test", mock_auth, AsyncMock(),
capability=TEST_CAP,
)
request = MagicMock()
request.query = {"token": "invalid-token"}
result = await socket_endpoint.handle(request)
# Should return HTTP 401
assert isinstance(result, web.HTTPUnauthorized)
# authenticate must have been invoked with a synthetic request
# carrying Bearer <the-token>. The endpoint wraps the query-
# string token into an Authorization header for a uniform auth
# path — the IAM layer does not look at query strings directly.
mock_auth.authenticate.assert_called_once()
passed_req = mock_auth.authenticate.call_args.args[0]
assert passed_req.headers["Authorization"] == "Bearer invalid-token"
@pytest.mark.asyncio
async def test_handle_missing_token():
"""Test handling of requests with missing token."""
"""Request with no ``token`` query param → 401 before any
IAM call is made (cheap short-circuit)."""
mock_auth = MagicMock()
mock_auth.authenticate = AsyncMock(
side_effect=AssertionError(
"authenticate must not be invoked when no token is present"
),
)
socket_endpoint = SocketEndpoint(
"/test", mock_auth, AsyncMock(),
capability=TEST_CAP,
)
request = MagicMock()
request.query = {} # No token
result = await socket_endpoint.handle(request)
# Should return HTTP 401
assert isinstance(result, web.HTTPUnauthorized)
mock_auth.authenticate.assert_not_called()
@pytest.mark.asyncio
async def test_handle_websocket_already_closed():
"""Test handling when websocket is already closed."""
mock_auth = MagicMock()
mock_auth.authenticate = AsyncMock(return_value=_valid_identity())
mock_auth.authorise = AsyncMock(return_value=None)
mock_dispatcher = AsyncMock()
mock_dispatcher.destroy = AsyncMock()
@ -335,7 +400,10 @@ async def test_handle_websocket_already_closed():
async def mock_dispatcher_factory(ws, running, match_info):
return mock_dispatcher
socket_endpoint = SocketEndpoint("/test", mock_auth, mock_dispatcher_factory)
socket_endpoint = SocketEndpoint(
"/test", mock_auth, mock_dispatcher_factory,
capability=TEST_CAP,
)
request = MagicMock()
request.query = {"token": "valid-token"}

View file

@ -15,13 +15,13 @@ from trustgraph.base import LlmResult
class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
"""Test Ollama processor functionality"""
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_processor_initialization_basic(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test basic processor initialization"""
# Arrange
mock_client = AsyncMock()
mock_client_class.return_value = mock_client
# Mock the parent class initialization
@ -44,13 +44,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
assert hasattr(processor, 'llm')
mock_client_class.assert_called_once_with(host='http://localhost:11434')
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_success(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test successful content generation"""
# Arrange
mock_client = AsyncMock()
mock_response = {
'response': 'Generated response from Ollama',
'prompt_eval_count': 15,
@ -83,13 +83,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
assert result.model == 'llama2'
mock_client.generate.assert_called_once_with('llama2', "System prompt\n\nUser prompt", options={'temperature': 0.0})
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_generic_exception(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test handling of generic exceptions"""
# Arrange
mock_client = AsyncMock()
mock_client.generate.side_effect = Exception("Connection error")
mock_client_class.return_value = mock_client
@ -110,13 +110,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
with pytest.raises(Exception, match="Connection error"):
await processor.generate_content("System prompt", "User prompt")
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_processor_initialization_with_custom_parameters(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test processor initialization with custom parameters"""
# Arrange
mock_client = AsyncMock()
mock_client_class.return_value = mock_client
mock_async_init.return_value = None
@ -137,13 +137,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
assert processor.default_model == 'mistral'
mock_client_class.assert_called_once_with(host='http://192.168.1.100:11434')
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_processor_initialization_with_defaults(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test processor initialization with default values"""
# Arrange
mock_client = AsyncMock()
mock_client_class.return_value = mock_client
mock_async_init.return_value = None
@ -164,13 +164,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
# Should use default_ollama (http://localhost:11434 or from OLLAMA_HOST env)
mock_client_class.assert_called_once()
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_empty_prompts(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test content generation with empty prompts"""
# Arrange
mock_client = AsyncMock()
mock_response = {
'response': 'Default response',
'prompt_eval_count': 2,
@ -205,13 +205,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
# The prompt should be "" + "\n\n" + "" = "\n\n"
mock_client.generate.assert_called_once_with('llama2', "\n\n", options={'temperature': 0.0})
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_token_counting(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test token counting from Ollama response"""
# Arrange
mock_client = AsyncMock()
mock_response = {
'response': 'Test response',
'prompt_eval_count': 50,
@ -243,13 +243,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
assert result.out_token == 25
assert result.model == 'llama2'
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_ollama_client_initialization(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test that Ollama client is initialized correctly"""
# Arrange
mock_client = AsyncMock()
mock_client_class.return_value = mock_client
mock_async_init.return_value = None
@ -273,13 +273,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
# Verify processor has the client
assert processor.llm == mock_client
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_prompt_construction(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test prompt construction with system and user prompts"""
# Arrange
mock_client = AsyncMock()
mock_response = {
'response': 'Response with system instructions',
'prompt_eval_count': 25,
@ -312,13 +312,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
# Verify the combined prompt
mock_client.generate.assert_called_once_with('llama2', "You are a helpful assistant\n\nWhat is AI?", options={'temperature': 0.0})
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_temperature_override(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test temperature parameter override functionality"""
# Arrange
mock_client = AsyncMock()
mock_response = {
'response': 'Response with custom temperature',
'prompt_eval_count': 20,
@ -360,13 +360,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
options={'temperature': 0.8} # Should use runtime override
)
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_model_override(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test model parameter override functionality"""
# Arrange
mock_client = AsyncMock()
mock_response = {
'response': 'Response with custom model',
'prompt_eval_count': 18,
@ -408,13 +408,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
options={'temperature': 0.1} # Should use processor default
)
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_both_parameters_override(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test overriding both model and temperature parameters simultaneously"""
# Arrange
mock_client = AsyncMock()
mock_response = {
'response': 'Response with both overrides',
'prompt_eval_count': 22,

View file

@ -49,21 +49,67 @@ class AsyncSocketClient:
return f"ws://{url}"
def _build_ws_url(self):
ws_url = f"{self.url.rstrip('/')}/api/v1/socket"
if self.token:
ws_url = f"{ws_url}?token={self.token}"
return ws_url
# /api/v1/socket uses the first-frame auth protocol — the
# token is sent as the first frame after connecting rather
# than in the URL. This avoids browser issues with 401 on
# the WebSocket handshake and lets long-lived sockets
# refresh credentials mid-session.
return f"{self.url.rstrip('/')}/api/v1/socket"
async def connect(self):
"""Establish the persistent websocket connection."""
"""Establish the persistent websocket connection and run the
first-frame auth handshake."""
if self._connected:
return
if not self.token:
raise ProtocolException(
"AsyncSocketClient requires a token for first-frame "
"auth against /api/v1/socket"
)
ws_url = self._build_ws_url()
self._connect_cm = websockets.connect(
ws_url, ping_interval=20, ping_timeout=self.timeout
)
self._socket = await self._connect_cm.__aenter__()
# First-frame auth: send {"type":"auth","token":"..."} and
# wait for auth-ok / auth-failed. Run before starting the
# reader task so the response isn't consumed by the reader's
# id-based routing.
await self._socket.send(json.dumps({
"type": "auth", "token": self.token,
}))
try:
raw = await asyncio.wait_for(
self._socket.recv(), timeout=self.timeout,
)
except asyncio.TimeoutError:
await self._socket.close()
raise ProtocolException("Timeout waiting for auth response")
try:
resp = json.loads(raw)
except Exception:
await self._socket.close()
raise ProtocolException(
f"Unexpected non-JSON auth response: {raw!r}"
)
if resp.get("type") == "auth-ok":
self.workspace = resp.get("workspace", self.workspace)
elif resp.get("type") == "auth-failed":
await self._socket.close()
raise ProtocolException(
f"auth failure: {resp.get('error', 'unknown')}"
)
else:
await self._socket.close()
raise ProtocolException(
f"Unexpected auth response: {resp!r}"
)
self._connected = True
self._reader_task = asyncio.create_task(self._reader())
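For orientation, a minimal sketch of the same first-frame exchange driven with the websockets library directly; the gateway URL and token are placeholders, and the frame shapes ("auth", "auth-ok", "auth-failed") mirror the handling in connect() above.

import asyncio
import json
import websockets

async def probe(url="ws://localhost:8088/api/v1/socket", token="<api-key>"):
    async with websockets.connect(url) as ws:
        # The first frame carries the credential; nothing else is
        # accepted until the server answers auth-ok.
        await ws.send(json.dumps({"type": "auth", "token": token}))
        resp = json.loads(await ws.recv())
        if resp.get("type") != "auth-ok":
            raise RuntimeError(f"auth rejected: {resp}")
        # Subsequent frames are normal id-correlated requests.
        return resp.get("workspace")

# asyncio.run(probe())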

View file

@ -112,10 +112,10 @@ class SocketClient:
return f"ws://{url}"
def _build_ws_url(self):
ws_url = f"{self.url.rstrip('/')}/api/v1/socket"
if self.token:
ws_url = f"{ws_url}?token={self.token}"
return ws_url
# /api/v1/socket uses the first-frame auth protocol — the
# token is sent as the first frame after connecting rather
# than in the URL.
return f"{self.url.rstrip('/')}/api/v1/socket"
def _get_loop(self):
"""Get or create the event loop, reusing across calls."""
@ -132,15 +132,58 @@ class SocketClient:
return self._loop
async def _ensure_connected(self):
"""Lazily establish the persistent websocket connection."""
"""Lazily establish the persistent websocket connection and
run the first-frame auth handshake."""
if self._connected:
return
if not self.token:
raise ProtocolException(
"SocketClient requires a token for first-frame auth "
"against /api/v1/socket"
)
ws_url = self._build_ws_url()
self._connect_cm = websockets.connect(
ws_url, ping_interval=20, ping_timeout=self.timeout
)
self._socket = await self._connect_cm.__aenter__()
# First-frame auth — run before starting the reader so the
# auth-ok / auth-failed response isn't consumed by the reader
# loop's id-based routing.
await self._socket.send(json.dumps({
"type": "auth", "token": self.token,
}))
try:
raw = await asyncio.wait_for(
self._socket.recv(), timeout=self.timeout,
)
except asyncio.TimeoutError:
await self._socket.close()
raise ProtocolException("Timeout waiting for auth response")
try:
resp = json.loads(raw)
except Exception:
await self._socket.close()
raise ProtocolException(
f"Unexpected non-JSON auth response: {raw!r}"
)
if resp.get("type") == "auth-ok":
self.workspace = resp.get("workspace", self.workspace)
elif resp.get("type") == "auth-failed":
await self._socket.close()
raise ProtocolException(
f"auth failure: {resp.get('error', 'unknown')}"
)
else:
await self._socket.close()
raise ProtocolException(
f"Unexpected auth response: {resp!r}"
)
self._connected = True
self._reader_task = asyncio.create_task(self._reader())

View file

@ -84,6 +84,18 @@ class ConfigClient(RequestResponse):
)
return resp.directory
async def get_all(self, workspace, timeout=CONFIG_TIMEOUT):
"""Return every config entry in ``workspace`` as a nested dict
``{type: {key: value}}``. Values are returned as the raw
strings stored by config-svc (typically JSON); callers parse
as needed. An empty dict means the workspace has no config."""
resp = await self._request(
operation="config",
workspace=workspace,
timeout=timeout,
)
return resp.config
async def workspaces_for_type(self, type, timeout=CONFIG_TIMEOUT):
"""Return the set of distinct workspaces with any config of
the given type."""
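A hedged usage sketch for get_all(); it assumes a ConfigClient instance has already been wired to the pub/sub backend (construction elided) and simply walks the nested dict, parsing JSON values where possible.

import json

async def dump_config(config_client, workspace="default"):
    entries = await config_client.get_all(workspace)
    for type_, items in entries.items():
        for key, value in items.items():
            # Values are raw strings (typically JSON); parse when possible.
            try:
                value = json.loads(value)
            except (TypeError, ValueError):
                pass
            print(type_, key, value)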

View file

@ -0,0 +1,342 @@
import json
from . request_response_spec import RequestResponse, RequestResponseSpec
from .. schema import (
IamRequest, IamResponse,
UserInput, WorkspaceInput, ApiKeyInput,
)
IAM_TIMEOUT = 10
class IamClient(RequestResponse):
"""Client for the IAM service request/response pub/sub protocol.
Mirrors ``ConfigClient``: a thin wrapper around ``RequestResponse``
that knows the IAM request / response schemas. Only the subset of
operations actually implemented by the server today has helper
methods here; callers that need an unimplemented operation can
build ``IamRequest`` and call ``request()`` directly.
"""
async def _request(self, timeout=IAM_TIMEOUT, **kwargs):
resp = await self.request(
IamRequest(**kwargs),
timeout=timeout,
)
if resp.error:
raise RuntimeError(
f"{resp.error.type}: {resp.error.message}"
)
return resp
async def bootstrap(self, timeout=IAM_TIMEOUT):
"""Initial-run IAM self-seed. Returns a tuple of
``(admin_user_id, admin_api_key_plaintext)``. Both are empty
strings on repeat calls; the operation is a no-op once the
IAM tables are populated."""
resp = await self._request(
operation="bootstrap", timeout=timeout,
)
return resp.bootstrap_admin_user_id, resp.bootstrap_admin_api_key
async def bootstrap_status(self, timeout=IAM_TIMEOUT):
"""Returns whether an unconsumed ``bootstrap`` call would
currently succeed (i.e. iam-svc is in ``bootstrap`` mode and
its tables are empty). Side-effect-free; intended for first-
run UX so a UI can decide whether to render setup."""
resp = await self._request(
operation="bootstrap-status", timeout=timeout,
)
return resp.bootstrap_available
async def whoami(self, actor, timeout=IAM_TIMEOUT):
"""Return the user record for ``actor`` (the authenticated
caller's handle). AUTHENTICATED-only; no capability check —
every authenticated user can read themselves."""
resp = await self._request(
operation="whoami",
actor=actor,
timeout=timeout,
)
return resp.user
async def resolve_api_key(self, api_key, timeout=IAM_TIMEOUT):
"""Resolve a plaintext API key to its identity triple.
Returns ``(user_id, workspace, roles)`` or raises
``RuntimeError`` with error type ``auth-failed`` if the key is
unknown / expired / revoked.
Note: the ``roles`` value is a regime-internal hint and is
not used by the gateway directly under the IAM contract;
all authorisation decisions go through ``authorise()``.
Returned here only for backward compatibility with callers
that haven't migrated."""
resp = await self._request(
operation="resolve-api-key",
api_key=api_key,
timeout=timeout,
)
return (
resp.resolved_user_id,
resp.resolved_workspace,
list(resp.resolved_roles),
)
async def authorise(self, identity_handle, capability,
resource, parameters, timeout=IAM_TIMEOUT):
"""Ask the IAM regime whether ``identity_handle`` may perform
``capability`` on ``resource`` given ``parameters``.
Implements the contract ``authorise(identity, capability,
resource, parameters) (decision, ttl)``. Returns a tuple
``(allow: bool, ttl_seconds: int)``. The TTL is the
regime's suggested cache lifetime for this decision; the
gateway honours it (clamped above by gateway-side policy)."""
resp = await self._request(
operation="authorise",
user_id=identity_handle,
capability=capability,
resource_json=json.dumps(resource or {}, sort_keys=True),
parameters_json=json.dumps(parameters or {}, sort_keys=True),
timeout=timeout,
)
return resp.decision_allow, resp.decision_ttl_seconds
async def authorise_many(self, identity_handle, checks,
timeout=IAM_TIMEOUT):
"""Bulk authorise. ``checks`` is a list of dicts each
carrying ``capability``, ``resource``, and ``parameters``.
Returns a list of ``(allow, ttl)`` tuples in the same order."""
resp = await self._request(
operation="authorise-many",
user_id=identity_handle,
authorise_checks=json.dumps(list(checks), sort_keys=True),
timeout=timeout,
)
decisions = json.loads(resp.decisions_json or "[]")
return [(d.get("allow", False), d.get("ttl", 0)) for d in decisions]
async def create_user(self, workspace, user, actor="",
timeout=IAM_TIMEOUT):
"""Create a user. ``user`` is a ``UserInput``."""
resp = await self._request(
operation="create-user",
workspace=workspace,
actor=actor,
user=user,
timeout=timeout,
)
return resp.user
async def list_users(self, workspace, actor="", timeout=IAM_TIMEOUT):
resp = await self._request(
operation="list-users",
workspace=workspace,
actor=actor,
timeout=timeout,
)
return list(resp.users)
async def create_api_key(self, workspace, key, actor="",
timeout=IAM_TIMEOUT):
"""Create an API key. ``key`` is an ``ApiKeyInput``. Returns
``(plaintext, record)``; the plaintext is returned once and the
caller is responsible for surfacing it to the operator."""
resp = await self._request(
operation="create-api-key",
workspace=workspace,
actor=actor,
key=key,
timeout=timeout,
)
return resp.api_key_plaintext, resp.api_key
async def list_api_keys(self, workspace, user_id, actor="",
timeout=IAM_TIMEOUT):
resp = await self._request(
operation="list-api-keys",
workspace=workspace,
actor=actor,
user_id=user_id,
timeout=timeout,
)
return list(resp.api_keys)
async def revoke_api_key(self, workspace, key_id, actor="",
timeout=IAM_TIMEOUT):
await self._request(
operation="revoke-api-key",
workspace=workspace,
actor=actor,
key_id=key_id,
timeout=timeout,
)
async def login(self, username, password, workspace="",
timeout=IAM_TIMEOUT):
"""Validate credentials and return ``(jwt, expires_iso)``.
``workspace`` is optional; defaults at the server to the
OSS default workspace."""
resp = await self._request(
operation="login",
workspace=workspace,
username=username,
password=password,
timeout=timeout,
)
return resp.jwt, resp.jwt_expires
async def get_signing_key_public(self, timeout=IAM_TIMEOUT):
"""Return the active JWT signing public key in PEM. The
gateway calls this at startup and caches the result."""
resp = await self._request(
operation="get-signing-key-public",
timeout=timeout,
)
return resp.signing_key_public
async def change_password(self, user_id, current_password,
new_password, timeout=IAM_TIMEOUT):
await self._request(
operation="change-password",
user_id=user_id,
password=current_password,
new_password=new_password,
timeout=timeout,
)
async def reset_password(self, workspace, user_id, actor="",
timeout=IAM_TIMEOUT):
"""Admin-driven password reset. Returns the plaintext
temporary password (returned once)."""
resp = await self._request(
operation="reset-password",
workspace=workspace,
actor=actor,
user_id=user_id,
timeout=timeout,
)
return resp.temporary_password
async def get_user(self, workspace, user_id, actor="",
timeout=IAM_TIMEOUT):
resp = await self._request(
operation="get-user",
workspace=workspace,
actor=actor,
user_id=user_id,
timeout=timeout,
)
return resp.user
async def update_user(self, workspace, user_id, user, actor="",
timeout=IAM_TIMEOUT):
resp = await self._request(
operation="update-user",
workspace=workspace,
actor=actor,
user_id=user_id,
user=user,
timeout=timeout,
)
return resp.user
async def disable_user(self, workspace, user_id, actor="",
timeout=IAM_TIMEOUT):
await self._request(
operation="disable-user",
workspace=workspace,
actor=actor,
user_id=user_id,
timeout=timeout,
)
async def enable_user(self, workspace, user_id, actor="",
timeout=IAM_TIMEOUT):
await self._request(
operation="enable-user",
workspace=workspace,
actor=actor,
user_id=user_id,
timeout=timeout,
)
async def delete_user(self, workspace, user_id, actor="",
timeout=IAM_TIMEOUT):
await self._request(
operation="delete-user",
workspace=workspace,
actor=actor,
user_id=user_id,
timeout=timeout,
)
async def create_workspace(self, workspace_record, actor="",
timeout=IAM_TIMEOUT):
resp = await self._request(
operation="create-workspace",
actor=actor,
workspace_record=workspace_record,
timeout=timeout,
)
return resp.workspace
async def list_workspaces(self, actor="", timeout=IAM_TIMEOUT):
resp = await self._request(
operation="list-workspaces",
actor=actor,
timeout=timeout,
)
return list(resp.workspaces)
async def get_workspace(self, workspace_id, actor="",
timeout=IAM_TIMEOUT):
from ..schema import WorkspaceInput
resp = await self._request(
operation="get-workspace",
actor=actor,
workspace_record=WorkspaceInput(id=workspace_id),
timeout=timeout,
)
return resp.workspace
async def update_workspace(self, workspace_record, actor="",
timeout=IAM_TIMEOUT):
resp = await self._request(
operation="update-workspace",
actor=actor,
workspace_record=workspace_record,
timeout=timeout,
)
return resp.workspace
async def disable_workspace(self, workspace_id, actor="",
timeout=IAM_TIMEOUT):
from ..schema import WorkspaceInput
await self._request(
operation="disable-workspace",
actor=actor,
workspace_record=WorkspaceInput(id=workspace_id),
timeout=timeout,
)
async def rotate_signing_key(self, actor="", timeout=IAM_TIMEOUT):
await self._request(
operation="rotate-signing-key",
actor=actor,
timeout=timeout,
)
class IamClientSpec(RequestResponseSpec):
def __init__(self, request_name, response_name):
super().__init__(
request_name=request_name,
request_schema=IamRequest,
response_name=response_name,
response_schema=IamResponse,
impl=IamClient,
)
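Illustrative only: the sketch below assumes an IamClient already built from IamClientSpec against a live pub/sub backend (wiring elided); the user id and capability are made-up values. It shows the authorise() helper plus the raw-request escape hatch the class docstring mentions.

async def check_and_inspect(iam: IamClient):
    allow, ttl = await iam.authorise(
        identity_handle="user-1234",          # hypothetical id
        capability="graph:write",
        resource={"workspace": "default"},
        parameters={},
    )
    print("allowed:", allow, "cache ttl:", ttl)

    # For operations without a helper, build the request directly.
    resp = await iam.request(
        IamRequest(operation="bootstrap-status"), timeout=IAM_TIMEOUT,
    )
    print("bootstrap available:", resp.bootstrap_available)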

View file

@ -15,6 +15,7 @@ from .translators.library import LibraryRequestTranslator, LibraryResponseTransl
from .translators.document_loading import DocumentTranslator, TextDocumentTranslator
from .translators.config import ConfigRequestTranslator, ConfigResponseTranslator
from .translators.flow import FlowRequestTranslator, FlowResponseTranslator
from .translators.iam import IamRequestTranslator, IamResponseTranslator
from .translators.prompt import PromptRequestTranslator, PromptResponseTranslator
from .translators.tool import ToolRequestTranslator, ToolResponseTranslator
from .translators.embeddings_query import (
@ -85,11 +86,17 @@ TranslatorRegistry.register_service(
)
TranslatorRegistry.register_service(
"flow",
FlowRequestTranslator(),
"flow",
FlowRequestTranslator(),
FlowResponseTranslator()
)
TranslatorRegistry.register_service(
"iam",
IamRequestTranslator(),
IamResponseTranslator()
)
TranslatorRegistry.register_service(
"prompt",
PromptRequestTranslator(),

View file

@ -0,0 +1,198 @@
from typing import Dict, Any, Tuple
from ...schema import IamRequest, IamResponse
from ...schema import (
UserInput, UserRecord,
WorkspaceInput, WorkspaceRecord,
ApiKeyInput, ApiKeyRecord,
)
from .base import MessageTranslator
def _user_input_from_dict(d):
if d is None:
return None
return UserInput(
username=d.get("username", ""),
name=d.get("name", ""),
email=d.get("email", ""),
password=d.get("password", ""),
roles=list(d.get("roles", [])),
enabled=d.get("enabled", True),
must_change_password=d.get("must_change_password", False),
)
def _workspace_input_from_dict(d):
if d is None:
return None
return WorkspaceInput(
id=d.get("id", ""),
name=d.get("name", ""),
enabled=d.get("enabled", True),
)
def _api_key_input_from_dict(d):
if d is None:
return None
return ApiKeyInput(
user_id=d.get("user_id", ""),
name=d.get("name", ""),
expires=d.get("expires", ""),
)
def _user_record_to_dict(r):
if r is None:
return None
return {
"id": r.id,
"workspace": r.workspace,
"username": r.username,
"name": r.name,
"email": r.email,
"roles": list(r.roles),
"enabled": r.enabled,
"must_change_password": r.must_change_password,
"created": r.created,
}
def _workspace_record_to_dict(r):
if r is None:
return None
return {
"id": r.id,
"name": r.name,
"enabled": r.enabled,
"created": r.created,
}
def _api_key_record_to_dict(r):
if r is None:
return None
return {
"id": r.id,
"user_id": r.user_id,
"name": r.name,
"prefix": r.prefix,
"expires": r.expires,
"created": r.created,
"last_used": r.last_used,
}
class IamRequestTranslator(MessageTranslator):
def decode(self, data: Dict[str, Any]) -> IamRequest:
return IamRequest(
operation=data.get("operation", ""),
workspace=data.get("workspace", ""),
actor=data.get("actor", ""),
user_id=data.get("user_id", ""),
username=data.get("username", ""),
key_id=data.get("key_id", ""),
api_key=data.get("api_key", ""),
password=data.get("password", ""),
new_password=data.get("new_password", ""),
user=_user_input_from_dict(data.get("user")),
workspace_record=_workspace_input_from_dict(
data.get("workspace_record")
),
key=_api_key_input_from_dict(data.get("key")),
)
def encode(self, obj: IamRequest) -> Dict[str, Any]:
result = {"operation": obj.operation}
for fname in (
"workspace", "actor", "user_id", "username", "key_id",
"api_key", "password", "new_password",
):
v = getattr(obj, fname, "")
if v:
result[fname] = v
if obj.user is not None:
result["user"] = {
"username": obj.user.username,
"name": obj.user.name,
"email": obj.user.email,
"password": obj.user.password,
"roles": list(obj.user.roles),
"enabled": obj.user.enabled,
"must_change_password": obj.user.must_change_password,
}
if obj.workspace_record is not None:
result["workspace_record"] = {
"id": obj.workspace_record.id,
"name": obj.workspace_record.name,
"enabled": obj.workspace_record.enabled,
}
if obj.key is not None:
result["key"] = {
"user_id": obj.key.user_id,
"name": obj.key.name,
"expires": obj.key.expires,
}
return result
class IamResponseTranslator(MessageTranslator):
def decode(self, data: Dict[str, Any]) -> IamResponse:
raise NotImplementedError(
"IamResponse is a server-produced message; no HTTP→schema "
"path is needed"
)
def encode(self, obj: IamResponse) -> Dict[str, Any]:
result: Dict[str, Any] = {}
if obj.user is not None:
result["user"] = _user_record_to_dict(obj.user)
if obj.users:
result["users"] = [_user_record_to_dict(u) for u in obj.users]
if obj.workspace is not None:
result["workspace"] = _workspace_record_to_dict(obj.workspace)
if obj.workspaces:
result["workspaces"] = [
_workspace_record_to_dict(w) for w in obj.workspaces
]
if obj.api_key_plaintext:
result["api_key_plaintext"] = obj.api_key_plaintext
if obj.api_key is not None:
result["api_key"] = _api_key_record_to_dict(obj.api_key)
if obj.api_keys:
result["api_keys"] = [
_api_key_record_to_dict(k) for k in obj.api_keys
]
if obj.jwt:
result["jwt"] = obj.jwt
if obj.jwt_expires:
result["jwt_expires"] = obj.jwt_expires
if obj.signing_key_public:
result["signing_key_public"] = obj.signing_key_public
if obj.resolved_user_id:
result["resolved_user_id"] = obj.resolved_user_id
if obj.resolved_workspace:
result["resolved_workspace"] = obj.resolved_workspace
if obj.resolved_roles:
result["resolved_roles"] = list(obj.resolved_roles)
if obj.temporary_password:
result["temporary_password"] = obj.temporary_password
if obj.bootstrap_admin_user_id:
result["bootstrap_admin_user_id"] = obj.bootstrap_admin_user_id
if obj.bootstrap_admin_api_key:
result["bootstrap_admin_api_key"] = obj.bootstrap_admin_api_key
# bootstrap-status: emit unconditionally — the false case is
# meaningful for UIs deciding whether to render first-run
# setup, so it can't be dropped by a truthy-only filter.
result["bootstrap_available"] = bool(obj.bootstrap_available)
return result
def encode_with_completion(
self, obj: IamResponse,
) -> Tuple[Dict[str, Any], bool]:
return self.encode(obj), True
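A quick round-trip sketch of the request-side translation above, using only what this module defines; the payload values are illustrative. The response direction is schema-to-JSON only, as decode() notes.

translator = IamRequestTranslator()
req = translator.decode({
    "operation": "create-api-key",
    "workspace": "default",
    "actor": "admin",
    "key": {"user_id": "user-1234", "name": "ci"},
})
assert req.operation == "create-api-key"
assert req.key.user_id == "user-1234"
# encode() reproduces the wire shape, skipping empty scalar fields.
wire = translator.encode(req)
assert wire["key"]["name"] == "ci"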

View file

@ -5,6 +5,7 @@ from .agent import *
from .flow import *
from .prompt import *
from .config import *
from .iam import *
from .library import *
from .lookup import *
from .nlp_query import *

View file

@ -0,0 +1,173 @@
from dataclasses import dataclass, field
from ..core.topic import queue
from ..core.primitives import Error
############################################################################
# IAM service — see docs/tech-specs/iam-protocol.md for the full protocol.
#
# Transport: request/response pub/sub, correlated by the `id` message
# property. Caller is the API gateway only; the IAM service trusts
# the bus per the enforcement-boundary policy (no per-request auth
# against the caller).
@dataclass
class UserInput:
username: str = ""
name: str = ""
email: str = ""
# Only populated on create-user; never on update-user.
password: str = ""
roles: list[str] = field(default_factory=list)
enabled: bool = True
must_change_password: bool = False
@dataclass
class UserRecord:
id: str = ""
workspace: str = ""
username: str = ""
name: str = ""
email: str = ""
roles: list[str] = field(default_factory=list)
enabled: bool = True
must_change_password: bool = False
created: str = ""
@dataclass
class WorkspaceInput:
id: str = ""
name: str = ""
enabled: bool = True
@dataclass
class WorkspaceRecord:
id: str = ""
name: str = ""
enabled: bool = True
created: str = ""
@dataclass
class ApiKeyInput:
user_id: str = ""
name: str = ""
expires: str = ""
@dataclass
class ApiKeyRecord:
id: str = ""
user_id: str = ""
name: str = ""
# First 4 chars of the plaintext token, for operator identification
# in list-api-keys. Never enough to reconstruct the key.
prefix: str = ""
expires: str = ""
created: str = ""
last_used: str = ""
@dataclass
class IamRequest:
operation: str = ""
# Workspace scope. Required on workspace-scoped operations;
# omitted for system-level ops (workspace CRUD, signing-key
# ops, bootstrap, resolve-api-key, login).
workspace: str = ""
# Acting user id for audit. Empty for internal-origin and for
# operations that resolve an identity (login, resolve-api-key).
actor: str = ""
user_id: str = ""
username: str = ""
key_id: str = ""
api_key: str = ""
password: str = ""
new_password: str = ""
user: UserInput | None = None
workspace_record: WorkspaceInput | None = None
key: ApiKeyInput | None = None
# ---- authorise / authorise-many inputs ----
# Capability string from the vocabulary in capabilities.md.
capability: str = ""
# Resource identifier as JSON. See the IAM contract spec for
# the resource-component vocabulary. An empty dict denotes a
# system-level resource.
resource_json: str = ""
# Operation parameters as JSON. Decision-relevant fields the
# operation supplied that are not part of the resource address
# (e.g. workspace association on create-user).
parameters_json: str = ""
# For authorise-many: a JSON-serialised list of
# {"capability": str, "resource": dict, "parameters": dict}.
authorise_checks: str = ""
@dataclass
class IamResponse:
user: UserRecord | None = None
users: list[UserRecord] = field(default_factory=list)
workspace: WorkspaceRecord | None = None
workspaces: list[WorkspaceRecord] = field(default_factory=list)
# create-api-key returns the plaintext once; never populated
# on any other operation.
api_key_plaintext: str = ""
api_key: ApiKeyRecord | None = None
api_keys: list[ApiKeyRecord] = field(default_factory=list)
# login, rotate-signing-key
jwt: str = ""
jwt_expires: str = ""
# get-signing-key-public
signing_key_public: str = ""
# resolve-api-key
resolved_user_id: str = ""
resolved_workspace: str = ""
resolved_roles: list[str] = field(default_factory=list)
# reset-password
temporary_password: str = ""
# bootstrap
bootstrap_admin_user_id: str = ""
bootstrap_admin_api_key: str = ""
# bootstrap-status — true iff iam-svc is in 'bootstrap' mode with
# empty tables, i.e. an unconsumed bootstrap call would succeed.
bootstrap_available: bool = False
# ---- authorise / authorise-many outputs ----
# authorise: the regime's allow / deny verdict.
decision_allow: bool = False
# Cache TTL the regime suggests, in seconds. Gateway respects
# this for both allow and deny decisions; bounded above by
# gateway-side policy (typically <= 60s).
decision_ttl_seconds: int = 0
# authorise-many: a JSON-serialised list of {"allow": bool,
# "ttl": int} in the same order as the request's
# authorise_checks.
decisions_json: str = ""
error: Error | None = None
iam_request_queue = queue('iam', cls='request')
iam_response_queue = queue('iam', cls='response')
############################################################################
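For orientation, a minimal sketch (values illustrative) of how an authorise round trip maps onto these dataclasses; in practice the requests are built by the gateway, not by hand.

import json

req = IamRequest(
    operation="authorise",
    user_id="user-1234",                      # hypothetical identity
    capability="graph:write",
    resource_json=json.dumps({"workspace": "default"}, sort_keys=True),
    parameters_json=json.dumps({}, sort_keys=True),
)

resp = IamResponse(decision_allow=True, decision_ttl_seconds=30)
assert (resp.decision_allow, resp.decision_ttl_seconds) == (True, 30)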

View file

@ -40,7 +40,22 @@ tg-get-flow-blueprint = "trustgraph.cli.get_flow_blueprint:main"
tg-get-kg-core = "trustgraph.cli.get_kg_core:main"
tg-get-document-content = "trustgraph.cli.get_document_content:main"
tg-graph-to-turtle = "trustgraph.cli.graph_to_turtle:main"
tg-init-trustgraph = "trustgraph.cli.init_trustgraph:main"
tg-bootstrap-iam = "trustgraph.cli.bootstrap_iam:main"
tg-login = "trustgraph.cli.login:main"
tg-create-user = "trustgraph.cli.create_user:main"
tg-list-users = "trustgraph.cli.list_users:main"
tg-whoami = "trustgraph.cli.whoami:main"
tg-update-user = "trustgraph.cli.update_user:main"
tg-disable-user = "trustgraph.cli.disable_user:main"
tg-enable-user = "trustgraph.cli.enable_user:main"
tg-delete-user = "trustgraph.cli.delete_user:main"
tg-change-password = "trustgraph.cli.change_password:main"
tg-reset-password = "trustgraph.cli.reset_password:main"
tg-create-api-key = "trustgraph.cli.create_api_key:main"
tg-list-api-keys = "trustgraph.cli.list_api_keys:main"
tg-revoke-api-key = "trustgraph.cli.revoke_api_key:main"
tg-list-workspaces = "trustgraph.cli.list_workspaces:main"
tg-create-workspace = "trustgraph.cli.create_workspace:main"
tg-invoke-agent = "trustgraph.cli.invoke_agent:main"
tg-invoke-document-rag = "trustgraph.cli.invoke_document_rag:main"
tg-invoke-graph-rag = "trustgraph.cli.invoke_graph_rag:main"

View file

@ -0,0 +1,75 @@
"""
Shared helpers for IAM CLI tools.
All IAM operations go through the gateway's ``/api/v1/iam`` forwarder,
with the three public auth operations (``login``, ``bootstrap``,
``change-password``) served via ``/api/v1/auth/...`` instead. These
helpers encapsulate the HTTP plumbing so each CLI can stay focused
on its own argument parsing and output formatting.
"""
import json
import os
import sys
import requests
DEFAULT_URL = os.getenv("TRUSTGRAPH_URL", "http://localhost:8088/")
DEFAULT_TOKEN = os.getenv("TRUSTGRAPH_TOKEN", None)
def _fmt_error(resp_json):
err = resp_json.get("error", {})
if isinstance(err, dict):
t = err.get("type", "")
m = err.get("message", "")
return f"{t}: {m}" if t else m or "error"
return str(err)
def _post(url, path, token, body):
endpoint = url.rstrip("/") + path
headers = {"Content-Type": "application/json"}
if token:
headers["Authorization"] = f"Bearer {token}"
resp = requests.post(
endpoint, headers=headers, data=json.dumps(body),
)
if resp.status_code != 200:
try:
payload = resp.json()
detail = _fmt_error(payload)
except Exception:
detail = resp.text
raise RuntimeError(f"HTTP {resp.status_code}: {detail}")
payload = resp.json()
if "error" in payload:
raise RuntimeError(_fmt_error(payload))
return payload
def call_iam(url, token, request):
"""Forward an IAM request through ``/api/v1/iam``. ``request`` is
the ``IamRequest`` dict shape."""
return _post(url, "/api/v1/iam", token, request)
def call_auth(url, path, token, body):
"""Hit one of the public auth endpoints
(``/api/v1/auth/login``, ``/api/v1/auth/change-password``, etc.).
``token`` is optional; login and bootstrap don't need one."""
return _post(url, path, token, body)
def run_main(fn, parser):
"""Standard error-handling wrapper for CLI main() bodies."""
args = parser.parse_args()
try:
fn(args)
except Exception as e:
print("Exception:", e, file=sys.stderr, flush=True)
sys.exit(1)
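A hedged example of the helpers in use, listing users through the /api/v1/iam forwarder. The import path follows the sibling CLIs' relative imports (trustgraph.cli._iam) and the token is a placeholder.

from trustgraph.cli._iam import DEFAULT_URL, call_iam

def list_usernames(token):
    resp = call_iam(DEFAULT_URL, token, {"operation": "list-users"})
    return [u.get("username", "") for u in resp.get("users", [])]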

View file

@ -0,0 +1,94 @@
"""
Bootstraps the IAM service. Only works when iam-svc is running in
bootstrap mode with empty tables. Prints the initial admin API key
to stdout.
This is a one-time, trust-sensitive operation. The resulting token
is shown once and never again; capture it when it is printed. Rotate and
revoke it as soon as a real admin API key has been issued.
"""
import argparse
import json
import os
import sys
import requests
default_url = os.getenv("TRUSTGRAPH_URL", "http://localhost:8088/")
def bootstrap(url):
# Unauthenticated public endpoint — IAM refuses the bootstrap
# operation unless the service is running in bootstrap mode with
# empty tables, so the safety gate lives on the server side.
endpoint = url.rstrip("/") + "/api/v1/auth/bootstrap"
headers = {"Content-Type": "application/json"}
resp = requests.post(
endpoint,
headers=headers,
data=json.dumps({}),
)
if resp.status_code != 200:
raise RuntimeError(
f"HTTP {resp.status_code}: {resp.text}"
)
body = resp.json()
if "error" in body:
raise RuntimeError(
f"IAM {body['error'].get('type', 'error')}: "
f"{body['error'].get('message', '')}"
)
api_key = body.get("bootstrap_admin_api_key")
user_id = body.get("bootstrap_admin_user_id")
if not api_key:
raise RuntimeError(
"IAM response did not contain a bootstrap token — the "
"service may already be bootstrapped, or may be running "
"in token mode."
)
return user_id, api_key
def main():
parser = argparse.ArgumentParser(
prog="tg-bootstrap-iam",
description=__doc__,
)
parser.add_argument(
"-u", "--api-url",
default=default_url,
help=f"API URL (default: {default_url})",
)
args = parser.parse_args()
try:
user_id, api_key = bootstrap(args.api_url)
except Exception as e:
print("Exception:", e, file=sys.stderr, flush=True)
sys.exit(1)
# Stdout gets machine-readable output (the key). Any operator
# context goes to stderr.
print(f"Admin user id: {user_id}", file=sys.stderr)
print(
"Admin API key (shown once, capture now):",
file=sys.stderr,
)
print(api_key)
if __name__ == "__main__":
main()
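For completeness, a sketch of driving the bootstrap programmatically rather than via the tg-bootstrap-iam console script; the URL is a placeholder and the module path comes from the pyproject entry above.

from trustgraph.cli.bootstrap_iam import bootstrap

user_id, api_key = bootstrap("http://localhost:8088/")
# The key cannot be retrieved again; store it securely right away.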

View file

@ -0,0 +1,46 @@
"""
Change your own password. Requires the current password.
"""
import argparse
import getpass
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_auth, run_main
def do_change_password(args):
current = args.current or getpass.getpass("Current password: ")
new = args.new or getpass.getpass("New password: ")
call_auth(
args.api_url, "/api/v1/auth/change-password", args.token,
{"current_password": current, "new_password": new},
)
print("Password changed.")
def main():
parser = argparse.ArgumentParser(
prog="tg-change-password", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
parser.add_argument(
"--current", default=None,
help="Current password (prompted if omitted)",
)
parser.add_argument(
"--new", default=None,
help="New password (prompted if omitted)",
)
run_main(do_change_password, parser)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,71 @@
"""
Create an API key for a user. Prints the plaintext key to stdout;
it is shown once only.
"""
import argparse
import sys
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def do_create_api_key(args):
key = {
"user_id": args.user_id,
"name": args.name,
}
if args.expires:
key["expires"] = args.expires
req = {"operation": "create-api-key", "key": key}
if args.workspace:
req["workspace"] = args.workspace
resp = call_iam(args.api_url, args.token, req)
plaintext = resp.get("api_key_plaintext", "")
rec = resp.get("api_key", {})
print(f"Key id: {rec.get('id', '')}", file=sys.stderr)
print(f"Name: {rec.get('name', '')}", file=sys.stderr)
print(f"Prefix: {rec.get('prefix', '')}", file=sys.stderr)
print(
"API key (shown once, capture now):", file=sys.stderr,
)
print(plaintext)
def main():
parser = argparse.ArgumentParser(
prog="tg-create-api-key", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
parser.add_argument(
"--user-id", required=True,
help="Owner user id",
)
parser.add_argument(
"--name", required=True,
help="Operator-facing label (e.g. 'laptop', 'ci')",
)
parser.add_argument(
"--expires", default=None,
help="ISO-8601 expiry (optional; empty = no expiry)",
)
parser.add_argument(
"-w", "--workspace", default=None,
help=(
"Target workspace (admin only; defaults to caller's "
"assigned workspace)"
),
)
run_main(do_create_api_key, parser)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,87 @@
"""
Create a user in the caller's workspace. Prints the new user id.
"""
import argparse
import getpass
import sys
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def do_create_user(args):
password = args.password
if not password:
password = getpass.getpass(
f"Password for new user {args.username}: "
)
user = {
"username": args.username,
"password": password,
"roles": args.roles,
}
if args.name:
user["name"] = args.name
if args.email:
user["email"] = args.email
if args.must_change_password:
user["must_change_password"] = True
req = {"operation": "create-user", "user": user}
if args.workspace:
req["workspace"] = args.workspace
resp = call_iam(args.api_url, args.token, req)
rec = resp.get("user", {})
print(f"User id: {rec.get('id', '')}", file=sys.stderr)
print(f"Username: {rec.get('username', '')}", file=sys.stderr)
print(f"Roles: {', '.join(rec.get('roles', []))}", file=sys.stderr)
print(rec.get("id", ""))
def main():
parser = argparse.ArgumentParser(
prog="tg-create-user", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
parser.add_argument(
"--username", required=True, help="Username (unique in workspace)",
)
parser.add_argument(
"--password", default=None,
help="Password (prompted if omitted)",
)
parser.add_argument(
"--name", default=None, help="Display name",
)
parser.add_argument(
"--email", default=None, help="Email",
)
parser.add_argument(
"--roles", nargs="+", default=["reader"],
help="One or more role names (default: reader)",
)
parser.add_argument(
"--must-change-password", action="store_true",
help="Force password change on next login",
)
parser.add_argument(
"-w", "--workspace", default=None,
help=(
"Target workspace (admin only; defaults to caller's "
"assigned workspace)"
),
)
run_main(do_create_user, parser)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,46 @@
"""
Create a workspace (system-level; requires admin).
"""
import argparse
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def do_create_workspace(args):
ws = {"id": args.workspace_id, "enabled": True}
if args.name:
ws["name"] = args.name
resp = call_iam(args.api_url, args.token, {
"operation": "create-workspace",
"workspace_record": ws,
})
rec = resp.get("workspace", {})
print(f"Workspace created: {rec.get('id', '')}")
def main():
parser = argparse.ArgumentParser(
prog="tg-create-workspace", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
parser.add_argument(
"--workspace-id", required=True,
help="New workspace id (must not start with '_')",
)
parser.add_argument(
"--name", default=None, help="Display name",
)
run_main(do_create_workspace, parser)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,62 @@
"""
Delete a user. Removes the user record, their username lookup,
and all their API keys. The freed username becomes available for
re-use.
Irreversible. Use tg-disable-user if you want to preserve the
record (audit trail, username squatting protection).
"""
import argparse
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def do_delete_user(args):
if not args.yes:
confirm = input(
f"Delete user {args.user_id}? This is irreversible. "
f"[type 'yes' to confirm]: "
)
if confirm.strip() != "yes":
print("Aborted.")
return
req = {"operation": "delete-user", "user_id": args.user_id}
if args.workspace:
req["workspace"] = args.workspace
call_iam(args.api_url, args.token, req)
print(f"Deleted user {args.user_id}")
def main():
parser = argparse.ArgumentParser(
prog="tg-delete-user", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
parser.add_argument(
"--user-id", required=True, help="User id to delete",
)
parser.add_argument(
"-w", "--workspace", default=None,
help=(
"Target workspace (admin only; defaults to caller's "
"assigned workspace)"
),
)
parser.add_argument(
"--yes", action="store_true",
help="Skip the interactive confirmation prompt",
)
run_main(do_delete_user, parser)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,45 @@
"""
Disable a user. Soft-deletes (enabled=false) and revokes all their
API keys.
"""
import argparse
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def do_disable_user(args):
req = {"operation": "disable-user", "user_id": args.user_id}
if args.workspace:
req["workspace"] = args.workspace
call_iam(args.api_url, args.token, req)
print(f"Disabled user {args.user_id}")
def main():
parser = argparse.ArgumentParser(
prog="tg-disable-user", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
parser.add_argument(
"--user-id", required=True, help="User id to disable",
)
parser.add_argument(
"-w", "--workspace", default=None,
help=(
"Target workspace (admin only; defaults to caller's "
"assigned workspace)"
),
)
run_main(do_disable_user, parser)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,45 @@
"""
Re-enable a previously disabled user. Does not restore their API
keys; those must be re-issued by an admin.
"""
import argparse
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def do_enable_user(args):
req = {"operation": "enable-user", "user_id": args.user_id}
if args.workspace:
req["workspace"] = args.workspace
call_iam(args.api_url, args.token, req)
print(f"Enabled user {args.user_id}")
def main():
parser = argparse.ArgumentParser(
prog="tg-enable-user", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
parser.add_argument(
"--user-id", required=True, help="User id to enable",
)
parser.add_argument(
"-w", "--workspace", default=None,
help=(
"Target workspace (admin only; defaults to caller's "
"assigned workspace)"
),
)
run_main(do_enable_user, parser)
if __name__ == "__main__":
main()

View file

@ -1,271 +0,0 @@
"""
Initialises TrustGraph pub/sub infrastructure and pushes initial config.
For Pulsar: creates tenant, namespaces, and retention policies.
For RabbitMQ: queues are auto-declared, so only config push is needed.
"""
import requests
import time
import argparse
import json
from trustgraph.clients.config_client import ConfigClient
from trustgraph.base.pubsub import add_pubsub_args
default_pulsar_admin_url = "http://pulsar:8080"
subscriber = "tg-init-pubsub"
def get_clusters(url):
print("Get clusters...", flush=True)
resp = requests.get(f"{url}/admin/v2/clusters")
if resp.status_code != 200: raise RuntimeError("Could not fetch clusters")
return resp.json()
def ensure_tenant(url, tenant, clusters):
resp = requests.get(f"{url}/admin/v2/tenants/{tenant}")
if resp.status_code == 200:
print(f"Tenant {tenant} already exists.", flush=True)
return
resp = requests.put(
f"{url}/admin/v2/tenants/{tenant}",
json={
"adminRoles": [],
"allowedClusters": clusters,
}
)
if resp.status_code != 204:
print(resp.text, flush=True)
raise RuntimeError("Tenant creation failed.")
print(f"Tenant {tenant} created.", flush=True)
def ensure_namespace(url, tenant, namespace, config):
resp = requests.get(f"{url}/admin/v2/namespaces/{tenant}/{namespace}")
if resp.status_code == 200:
print(f"Namespace {tenant}/{namespace} already exists.", flush=True)
return
resp = requests.put(
f"{url}/admin/v2/namespaces/{tenant}/{namespace}",
json=config,
)
if resp.status_code != 204:
print(resp.status_code, flush=True)
print(resp.text, flush=True)
raise RuntimeError(f"Namespace {tenant}/{namespace} creation failed.")
print(f"Namespace {tenant}/{namespace} created.", flush=True)
def ensure_config(config, workspace="default", **pubsub_config):
cli = ConfigClient(
subscriber=subscriber,
workspace=workspace,
**pubsub_config,
)
while True:
try:
print("Get current config...", flush=True)
current, version = cli.config(timeout=5)
except Exception as e:
print("Exception:", e, flush=True)
time.sleep(2)
print("Retrying...", flush=True)
continue
print("Current config version is", version, flush=True)
if version != 0:
print("Already updated, not updating config. Done.", flush=True)
return
print("Config is version 0, updating...", flush=True)
batch = []
for type in config:
for key in config[type]:
print(f"Adding {type}/{key} to update.", flush=True)
batch.append({
"type": type,
"key": key,
"value": json.dumps(config[type][key]),
})
try:
cli.put(batch, timeout=10)
print("Update succeeded.", flush=True)
break
except Exception as e:
print("Exception:", e, flush=True)
time.sleep(2)
print("Retrying...", flush=True)
continue
def init_pulsar(pulsar_admin_url, tenant):
"""Pulsar-specific setup: create tenant, namespaces, retention policies."""
clusters = get_clusters(pulsar_admin_url)
ensure_tenant(pulsar_admin_url, tenant, clusters)
ensure_namespace(pulsar_admin_url, tenant, "flow", {})
ensure_namespace(pulsar_admin_url, tenant, "request", {})
ensure_namespace(pulsar_admin_url, tenant, "response", {
"retention_policies": {
"retentionSizeInMB": -1,
"retentionTimeInMinutes": 3,
"subscriptionExpirationTimeMinutes": 30,
}
})
ensure_namespace(pulsar_admin_url, tenant, "notify", {
"retention_policies": {
"retentionSizeInMB": -1,
"retentionTimeInMinutes": 3,
"subscriptionExpirationTimeMinutes": 5,
}
})
def push_config(config_json, config_file, workspace="default",
**pubsub_config):
"""Push initial config if provided."""
if config_json is not None:
try:
print("Decoding config...", flush=True)
dec = json.loads(config_json)
print("Decoded.", flush=True)
except Exception as e:
print("Exception:", e, flush=True)
raise e
ensure_config(dec, workspace=workspace, **pubsub_config)
elif config_file is not None:
try:
print("Decoding config...", flush=True)
dec = json.load(open(config_file))
print("Decoded.", flush=True)
except Exception as e:
print("Exception:", e, flush=True)
raise e
ensure_config(dec, workspace=workspace, **pubsub_config)
else:
print("No config to update.", flush=True)
def main():
parser = argparse.ArgumentParser(
prog='tg-init-trustgraph',
description=__doc__,
)
parser.add_argument(
'--pulsar-admin-url',
default=default_pulsar_admin_url,
help=f'Pulsar admin URL (default: {default_pulsar_admin_url})',
)
parser.add_argument(
'-c', '--config',
help=f'Initial configuration to load',
)
parser.add_argument(
'-C', '--config-file',
help=f'Initial configuration to load from file',
)
parser.add_argument(
'-t', '--tenant',
default="tg",
help=f'Tenant (default: tg)',
)
parser.add_argument(
'-w', '--workspace',
default="default",
help=f'Workspace (default: default)',
)
add_pubsub_args(parser)
args = parser.parse_args()
backend_type = args.pubsub_backend
# Extract pubsub config from args
pubsub_config = {
k: v for k, v in vars(args).items()
if k not in (
'pulsar_admin_url', 'config', 'config_file', 'tenant',
'workspace',
)
}
while True:
try:
# Pulsar-specific setup (tenants, namespaces)
if backend_type == 'pulsar':
print(flush=True)
print(
f"Initialising Pulsar at {args.pulsar_admin_url}...",
flush=True,
)
init_pulsar(args.pulsar_admin_url, args.tenant)
else:
print(flush=True)
print(
f"Using {backend_type} backend (no admin setup needed).",
flush=True,
)
# Push config (works with any backend)
push_config(
args.config, args.config_file,
workspace=args.workspace,
**pubsub_config,
)
print("Initialisation complete.", flush=True)
break
except Exception as e:
print("Exception:", e, flush=True)
print("Sleeping...", flush=True)
time.sleep(2)
print("Will retry...", flush=True)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,69 @@
"""
List the API keys for a user.
"""
import argparse
import tabulate
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def do_list_api_keys(args):
req = {"operation": "list-api-keys", "user_id": args.user_id}
if args.workspace:
req["workspace"] = args.workspace
resp = call_iam(args.api_url, args.token, req)
keys = resp.get("api_keys", [])
if not keys:
print("No keys.")
return
rows = [
[
k.get("id", ""),
k.get("name", ""),
k.get("prefix", ""),
k.get("created", ""),
k.get("last_used", "") or "",
k.get("expires", "") or "never",
]
for k in keys
]
print(tabulate.tabulate(
rows,
headers=["id", "name", "prefix", "created", "last used", "expires"],
tablefmt="pretty",
stralign="left",
))
def main():
parser = argparse.ArgumentParser(
prog="tg-list-api-keys", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
parser.add_argument(
"--user-id", required=True,
help="Owner user id",
)
parser.add_argument(
"-w", "--workspace", default=None,
help=(
"Target workspace (admin only; defaults to caller's "
"assigned workspace)"
),
)
run_main(do_list_api_keys, parser)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,65 @@
"""
List users in the caller's workspace.
"""
import argparse
import tabulate
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def do_list_users(args):
req = {"operation": "list-users"}
if args.workspace:
req["workspace"] = args.workspace
resp = call_iam(args.api_url, args.token, req)
users = resp.get("users", [])
if not users:
print("No users.")
return
rows = [
[
u.get("id", ""),
u.get("username", ""),
u.get("name", ""),
", ".join(u.get("roles", [])),
"yes" if u.get("enabled") else "no",
"yes" if u.get("must_change_password") else "no",
]
for u in users
]
print(tabulate.tabulate(
rows,
headers=["id", "username", "name", "roles", "enabled", "change-pw"],
tablefmt="pretty",
stralign="left",
))
def main():
parser = argparse.ArgumentParser(
prog="tg-list-users", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
parser.add_argument(
"-w", "--workspace", default=None,
help=(
"Target workspace (admin only; defaults to caller's "
"assigned workspace)"
),
)
run_main(do_list_users, parser)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,53 @@
"""
List workspaces (system-level; requires admin).
"""
import argparse
import tabulate
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def do_list_workspaces(args):
resp = call_iam(
args.api_url, args.token, {"operation": "list-workspaces"},
)
workspaces = resp.get("workspaces", [])
if not workspaces:
print("No workspaces.")
return
rows = [
[
w.get("id", ""),
w.get("name", ""),
"yes" if w.get("enabled") else "no",
w.get("created", ""),
]
for w in workspaces
]
print(tabulate.tabulate(
rows,
headers=["id", "name", "enabled", "created"],
tablefmt="pretty",
stralign="left",
))
def main():
parser = argparse.ArgumentParser(
prog="tg-list-workspaces", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
run_main(do_list_workspaces, parser)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,62 @@
"""
Log in with username / password. Prints the resulting JWT to
stdout so it can be captured for subsequent CLI use.
"""
import argparse
import getpass
import sys
from ._iam import DEFAULT_URL, call_auth, run_main
def do_login(args):
password = args.password
if not password:
password = getpass.getpass(f"Password for {args.username}: ")
body = {
"username": args.username,
"password": password,
}
if args.workspace:
body["workspace"] = args.workspace
resp = call_auth(args.api_url, "/api/v1/auth/login", None, body)
jwt = resp.get("jwt", "")
expires = resp.get("jwt_expires", "")
if expires:
print(f"JWT expires: {expires}", file=sys.stderr)
# Machine-readable on stdout.
print(jwt)
def main():
parser = argparse.ArgumentParser(
prog="tg-login", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"--username", required=True, help="Username",
)
parser.add_argument(
"--password", default=None,
help="Password (prompted if omitted)",
)
parser.add_argument(
"-w", "--workspace", default=None,
help=(
"Optional workspace to log in against. Defaults to "
"the user's assigned workspace."
),
)
run_main(do_login, parser)
if __name__ == "__main__":
main()
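Because the JWT goes to stdout and operator context to stderr, the token can be captured and reused by the other commands. A hedged sketch of the same flow driven through the shared helpers (import path assumed, credentials illustrative):

from trustgraph.cli._iam import call_auth, call_iam  # path is an assumption

url = "http://localhost:8088/"

# Log in; the response carries the JWT and its expiry.
login = call_auth(url, "/api/v1/auth/login", None, {
    "username": "admin",              # illustrative credentials
    "password": "example-password",
})
jwt = login.get("jwt", "")

# Use the fresh JWT as the bearer token for an IAM operation.
me = call_iam(url, jwt, {"operation": "whoami"})
print(me.get("user", {}).get("username", ""))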

View file

@ -0,0 +1,54 @@
"""
Admin: reset another user's password. Prints a one-time temporary
password to stdout. The user is forced to change it on next login.
"""
import argparse
import sys
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def do_reset_password(args):
req = {"operation": "reset-password", "user_id": args.user_id}
if args.workspace:
req["workspace"] = args.workspace
resp = call_iam(args.api_url, args.token, req)
tmp = resp.get("temporary_password", "")
if not tmp:
raise RuntimeError(
"IAM returned no temporary password — unexpected"
)
print("Temporary password (shown once, capture now):", file=sys.stderr)
print(tmp)
def main():
parser = argparse.ArgumentParser(
prog="tg-reset-password", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
parser.add_argument(
"--user-id", required=True,
help="Target user id",
)
parser.add_argument(
"-w", "--workspace", default=None,
help=(
"Target workspace (admin only; defaults to caller's "
"assigned workspace)"
),
)
run_main(do_reset_password, parser)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,44 @@
"""
Revoke an API key by id.
"""
import argparse
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def do_revoke_api_key(args):
req = {"operation": "revoke-api-key", "key_id": args.key_id}
if args.workspace:
req["workspace"] = args.workspace
call_iam(args.api_url, args.token, req)
print(f"Revoked key {args.key_id}")
def main():
parser = argparse.ArgumentParser(
prog="tg-revoke-api-key", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
parser.add_argument(
"--key-id", required=True, help="Key id to revoke",
)
parser.add_argument(
"-w", "--workspace", default=None,
help=(
"Target workspace (admin only; defaults to caller's "
"assigned workspace)"
),
)
run_main(do_revoke_api_key, parser)
if __name__ == "__main__":
main()

View file

@ -44,16 +44,18 @@ def show_processors(metrics_url, flow_label):
obj = resp.json()
tbl = [
[
m["metric"]["job"],
"\U0001f49a" if int(m["value"][1]) > 0 else "\U0000274c"
]
for m in obj["data"]["result"]
]
# consumer_state is one sample per consumer (queue); a processor
# with N subscriptions shows up N times. Aggregate to one row per
# processor: green only if every consumer is running.
by_proc = {}
for m in obj["data"]["result"]:
name = m["metric"].get("processor", m["metric"]["job"])
running = int(m["value"][1]) > 0
by_proc[name] = by_proc.get(name, True) and running
for row in tbl:
print(f"- {row[0]:30} {row[1]}")
for name in sorted(by_proc):
icon = "\U0001f49a" if by_proc[name] else "\U0000274c"
print(f"- {name:30} {icon}")
def main():

View file

@ -17,7 +17,7 @@ def dump_status(url):
tbl = [
[
m["metric"]["job"],
m["metric"].get("processor", m["metric"]["job"]),
"\U0001f49a"
]
for m in obj["data"]["result"]

View file

@ -0,0 +1,125 @@
"""
Update a user's profile fields: name, email, roles, enabled flag,
must-change-password flag.
Username is immutable; create a new user and disable the old one
to effect a username change. Password changes go through
``tg-change-password`` (self-service) or ``tg-reset-password``
(admin-driven).
Only the fields you supply are changed; omitted fields are left
untouched on the user record. An empty ``--roles`` is rejected by
iam-svc (a user must have at least one role); to demote a user use
``tg-disable-user``.
"""
import argparse
import sys
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def _parse_bool(s):
if s is None:
return None
s = s.strip().lower()
if s in ("yes", "y", "true", "t", "1"):
return True
if s in ("no", "n", "false", "f", "0"):
return False
raise argparse.ArgumentTypeError(
f"expected yes/no, got {s!r}"
)
def do_update_user(args):
user = {}
if args.name is not None:
user["name"] = args.name
if args.email is not None:
user["email"] = args.email
if args.roles is not None:
user["roles"] = args.roles
if args.enabled is not None:
user["enabled"] = args.enabled
if args.must_change_password is not None:
user["must_change_password"] = args.must_change_password
if not user:
print(
"tg-update-user: nothing to change — supply at least "
"one of --name / --email / --roles / --enabled / "
"--must-change-password",
file=sys.stderr,
)
sys.exit(2)
req = {
"operation": "update-user",
"user_id": args.user_id,
"user": user,
}
if args.workspace:
req["workspace"] = args.workspace
resp = call_iam(args.api_url, args.token, req)
rec = resp.get("user", {})
print(f"id : {rec.get('id', '')}")
print(f"username : {rec.get('username', '')}")
print(f"name : {rec.get('name', '')}")
print(f"email : {rec.get('email', '')}")
print(f"workspace : {rec.get('workspace', '')}")
print(f"roles : {', '.join(rec.get('roles', []))}")
print(f"enabled : {'yes' if rec.get('enabled') else 'no'}")
print(
f"must-change-pw: "
f"{'yes' if rec.get('must_change_password') else 'no'}"
)
def main():
parser = argparse.ArgumentParser(
prog="tg-update-user", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
parser.add_argument(
"--user-id", required=True, help="Target user id",
)
parser.add_argument(
"--name", default=None, help="New display name",
)
parser.add_argument(
"--email", default=None, help="New email",
)
parser.add_argument(
"--roles", nargs="+", default=None,
help="Replacement role list (e.g. --roles reader writer)",
)
parser.add_argument(
"--enabled", type=_parse_bool, default=None,
help="Set enabled flag (yes/no)",
)
parser.add_argument(
"--must-change-password", type=_parse_bool, default=None,
help="Set must-change-password flag (yes/no)",
)
parser.add_argument(
"-w", "--workspace", default=None,
help=(
"Optional workspace integrity check — when supplied, "
"iam-svc verifies the target user's home workspace "
"matches"
),
)
run_main(do_update_user, parser)
if __name__ == "__main__":
main()
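For reference, the partial-update request the command sends has this shape; only the keys present in ``user`` are changed, and the values below are illustrative:

request = {
    "operation": "update-user",
    "user_id": "u-1234",                 # illustrative target id
    "user": {
        "roles": ["reader", "writer"],   # full replacement role list
        "enabled": True,
    },
    # "workspace": "acme",               # optional admin-only integrity check
}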

View file

@ -0,0 +1,52 @@
"""
Show the authenticated caller's own user record.
"""
import argparse
import tabulate
from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
def do_whoami(args):
resp = call_iam(args.api_url, args.token, {"operation": "whoami"})
user = resp.get("user")
if not user:
print("(no user record returned)")
return
rows = [
["id", user.get("id", "")],
["username", user.get("username", "")],
["name", user.get("name", "")],
["email", user.get("email", "")],
["workspace", user.get("workspace", "")],
["roles", ", ".join(user.get("roles", []))],
["enabled", "yes" if user.get("enabled") else "no"],
[
"must change password",
"yes" if user.get("must_change_password") else "no",
],
["created", user.get("created", "")],
]
print(tabulate.tabulate(rows, tablefmt="plain"))
def main():
parser = argparse.ArgumentParser(
prog="tg-whoami", description=__doc__,
)
parser.add_argument(
"-u", "--api-url", default=DEFAULT_URL,
help=f"API URL (default: {DEFAULT_URL})",
)
parser.add_argument(
"-t", "--token", default=DEFAULT_TOKEN,
help="Auth token (default: $TRUSTGRAPH_TOKEN)",
)
run_main(do_whoami, parser)
if __name__ == "__main__":
main()

View file

@ -60,8 +60,10 @@ agent-orchestrator = "trustgraph.agent.orchestrator:run"
api-gateway = "trustgraph.gateway:run"
chunker-recursive = "trustgraph.chunking.recursive:run"
chunker-token = "trustgraph.chunking.token:run"
bootstrap = "trustgraph.bootstrap.bootstrapper:run"
config-svc = "trustgraph.config.service:run"
flow-svc = "trustgraph.flow.service:run"
iam-svc = "trustgraph.iam.service:run"
doc-embeddings-query-milvus = "trustgraph.query.doc_embeddings.milvus:run"
doc-embeddings-query-pinecone = "trustgraph.query.doc_embeddings.pinecone:run"
doc-embeddings-query-qdrant = "trustgraph.query.doc_embeddings.qdrant:run"

View file

@ -0,0 +1,68 @@
"""
Bootstrap framework: Initialiser base class and per-wake context.
See docs/tech-specs/bootstrap.md for the full design.
"""
import logging
from dataclasses import dataclass
from typing import Any
@dataclass
class InitContext:
"""Shared per-wake context passed to each initialiser.
The bootstrapper constructs one of these on every wake cycle,
tears it down at cycle end, and passes it into each initialiser's
``run()`` method. Fields are short-lived and safe to use during
a single cycle only.
"""
logger: logging.Logger
config: Any # ConfigClient
flow: Any # RequestResponse client for flow-svc
class Initialiser:
"""Base class for bootstrap initialisers.
Subclasses implement :meth:`run`. The bootstrapper manages
completion state, flag comparison, retry, and error handling;
subclasses describe only the work to perform.
Class attributes:
* ``wait_for_services`` (bool, default ``True``): when ``True`` the
initialiser only runs after the bootstrapper's service gate has
passed (config-svc and flow-svc reachable). Set ``False`` for
initialisers that bring up infrastructure the gate itself
depends on: principally Pulsar topology, without which
config-svc cannot come online.
"""
wait_for_services: bool = True
def __init__(self, **params):
# Subclasses should consume their own params via keyword
# arguments in their own __init__ signatures. This catch-all
# is here so any kwargs that filter through unnoticed don't
# raise TypeError on construction.
pass
async def run(self, ctx, old_flag, new_flag):
"""Perform initialisation work.
:param ctx: :class:`InitContext` with logger, config client,
flow-svc client.
:param old_flag: Previously-stored flag string, or ``None`` if
this initialiser has never successfully completed in this
deployment.
:param new_flag: Currently-configured flag. A string chosen
by the operator; typically something like ``"v1"``.
:raises: Any exception on failure. The bootstrapper catches,
logs, and re-runs on the next cycle; completion state is
only written on clean return.
"""
raise NotImplementedError
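A minimal sketch of a subclass under this contract; the config type and key written here are hypothetical, only the ``run`` signature and ``wait_for_services`` attribute come from the base class above:

class ExampleMarker(Initialiser):
    """Hypothetical example: writes a single marker entry per wake cycle."""

    wait_for_services = True    # only run once config-svc / flow-svc respond

    def __init__(self, workspace="default", **kwargs):
        super().__init__(**kwargs)
        self.workspace = workspace

    async def run(self, ctx, old_flag, new_flag):
        ctx.logger.info(
            f"Marking workspace {self.workspace!r} "
            f"({old_flag!r} -> {new_flag!r})"
        )
        # Hypothetical config type / key. Any exception raised here
        # makes the bootstrapper retry on its next cycle; the flag is
        # only persisted after a clean return.
        await ctx.config.put(
            self.workspace, "bootstrap-example", "marker", new_flag,
        )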

View file

@ -0,0 +1 @@
from . service import *

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from . service import run
if __name__ == '__main__':
run()

View file

@ -0,0 +1,414 @@
"""
Bootstrapper processor.
Runs a pluggable list of initialisers in a reconciliation loop.
Each initialiser's completion state is recorded in the reserved
``__system__`` workspace under the ``init-state`` config type.
See docs/tech-specs/bootstrap.md for the full design.
"""
import asyncio
import importlib
import json
import logging
import uuid
from argparse import ArgumentParser
from dataclasses import dataclass
from trustgraph.base import AsyncProcessor
from trustgraph.base import ProducerMetrics, SubscriberMetrics
from trustgraph.base.config_client import ConfigClient
from trustgraph.base.request_response_spec import RequestResponse
from trustgraph.schema import (
ConfigRequest, ConfigResponse,
config_request_queue, config_response_queue,
)
from trustgraph.schema import (
FlowRequest, FlowResponse,
flow_request_queue, flow_response_queue,
)
from .. base import Initialiser, InitContext
logger = logging.getLogger(__name__)
default_ident = "bootstrap"
# Reserved workspace + config type under which completion state is
# stored. Reserved (`_`-prefix) workspaces are excluded from the
# config push broadcast — live processors never see these keys.
SYSTEM_WORKSPACE = "__system__"
INIT_STATE_TYPE = "init-state"
# Cadence tiers.
GATE_BACKOFF = 5 # Services not responding; retry soon.
INIT_RETRY = 15 # Gate passed but something ran/failed;
# converge quickly.
STEADY_INTERVAL = 300 # Everything at target flag; idle cheaply.
@dataclass
class InitialiserSpec:
"""One entry in the bootstrapper's configured list of initialisers."""
name: str
flag: str
instance: Initialiser
def _resolve_class(dotted):
"""Import and return a class by its dotted path."""
module_path, _, class_name = dotted.rpartition(".")
if not module_path:
raise ValueError(
f"Initialiser class must be a dotted path, got {dotted!r}"
)
module = importlib.import_module(module_path)
return getattr(module, class_name)
def _load_initialisers_file(path):
"""Load the initialisers spec list from a YAML or JSON file.
File shape:
.. code-block:: yaml
initialisers:
- class: trustgraph.bootstrap.initialisers.PulsarTopology
name: pulsar-topology
flag: v1
params:
admin_url: http://pulsar:8080
tenant: tg
- ...
"""
with open(path) as f:
content = f.read()
if path.endswith((".yaml", ".yml")):
import yaml
doc = yaml.safe_load(content)
else:
doc = json.loads(content)
if not isinstance(doc, dict) or "initialisers" not in doc:
raise RuntimeError(
f"{path}: expected a mapping with an 'initialisers' key"
)
return doc["initialisers"]
class Processor(AsyncProcessor):
def __init__(self, **params):
super().__init__(**params)
# Source the initialisers list either from a direct parameter
# (processor-group embedding) or from a file (CLI launch).
inits = params.get("initialisers")
if inits is None:
inits_file = params.get("initialisers_file")
if inits_file is None:
raise RuntimeError(
"Bootstrapper requires either the 'initialisers' "
"parameter or --initialisers-file"
)
inits = _load_initialisers_file(inits_file)
self.specs = []
names = set()
for entry in inits:
if not isinstance(entry, dict):
raise RuntimeError(
f"Initialiser entry must be a mapping, got: {entry!r}"
)
for required in ("class", "name", "flag"):
if required not in entry:
raise RuntimeError(
f"Initialiser entry missing required field "
f"{required!r}: {entry!r}"
)
name = entry["name"]
if name in names:
raise RuntimeError(f"Duplicate initialiser name {name!r}")
names.add(name)
cls = _resolve_class(entry["class"])
try:
instance = cls(**entry.get("params", {}))
except Exception as e:
raise RuntimeError(
f"Failed to instantiate initialiser "
f"{entry['class']!r} as {name!r}: "
f"{type(e).__name__}: {e}"
)
self.specs.append(InitialiserSpec(
name=name,
flag=entry["flag"],
instance=instance,
))
logger.info(
f"Bootstrapper: loaded {len(self.specs)} initialisers"
)
# ------------------------------------------------------------------
# Client construction (short-lived per wake cycle).
# ------------------------------------------------------------------
def _make_config_client(self):
rr_id = str(uuid.uuid4())
return ConfigClient(
backend=self.pubsub_backend,
subscription=f"{self.id}--config--{rr_id}",
consumer_name=self.id,
request_topic=config_request_queue,
request_schema=ConfigRequest,
request_metrics=ProducerMetrics(
processor=self.id, flow=None, name="config-request",
),
response_topic=config_response_queue,
response_schema=ConfigResponse,
response_metrics=SubscriberMetrics(
processor=self.id, flow=None, name="config-response",
),
)
def _make_flow_client(self):
rr_id = str(uuid.uuid4())
return RequestResponse(
backend=self.pubsub_backend,
subscription=f"{self.id}--flow--{rr_id}",
consumer_name=self.id,
request_topic=flow_request_queue,
request_schema=FlowRequest,
request_metrics=ProducerMetrics(
processor=self.id, flow=None, name="flow-request",
),
response_topic=flow_response_queue,
response_schema=FlowResponse,
response_metrics=SubscriberMetrics(
processor=self.id, flow=None, name="flow-response",
),
)
async def _open_clients(self):
config = self._make_config_client()
flow = self._make_flow_client()
await config.start()
try:
await flow.start()
except Exception:
await self._safe_stop(config)
raise
return config, flow
async def _safe_stop(self, client):
try:
await client.stop()
except Exception:
pass
# ------------------------------------------------------------------
# Service gate.
# ------------------------------------------------------------------
async def _gate_ready(self, config, flow):
try:
await config.keys(SYSTEM_WORKSPACE, INIT_STATE_TYPE)
except Exception as e:
logger.info(
f"Gate: config-svc not ready ({type(e).__name__}: {e})"
)
return False
try:
resp = await flow.request(
FlowRequest(
operation="list-blueprints",
workspace=SYSTEM_WORKSPACE,
),
timeout=5,
)
if resp.error:
logger.info(
f"Gate: flow-svc error: "
f"{resp.error.type}: {resp.error.message}"
)
return False
except Exception as e:
logger.info(
f"Gate: flow-svc not ready ({type(e).__name__}: {e})"
)
return False
return True
# ------------------------------------------------------------------
# Completion state.
# ------------------------------------------------------------------
async def _stored_flag(self, config, name):
raw = await config.get(SYSTEM_WORKSPACE, INIT_STATE_TYPE, name)
if raw is None:
return None
try:
return json.loads(raw)
except Exception:
return raw
async def _store_flag(self, config, name, flag):
await config.put(
SYSTEM_WORKSPACE, INIT_STATE_TYPE, name,
json.dumps(flag),
)
# ------------------------------------------------------------------
# Per-spec execution.
# ------------------------------------------------------------------
async def _run_spec(self, spec, config, flow):
"""Run a single initialiser spec.
Returns one of:
- ``"skip"``: stored flag already matches target, nothing to do.
- ``"ran"``: initialiser ran and completion state was updated.
- ``"failed"``: initialiser raised.
- ``"failed-state-write"``: initialiser succeeded but we could
not persist the new flag (transient; will re-run next cycle).
"""
try:
old_flag = await self._stored_flag(config, spec.name)
except Exception as e:
logger.warning(
f"{spec.name}: could not read stored flag "
f"({type(e).__name__}: {e})"
)
return "failed"
if old_flag == spec.flag:
return "skip"
child_logger = logger.getChild(spec.name)
child_ctx = InitContext(
logger=child_logger,
config=config,
flow=flow,
)
child_logger.info(
f"Running (old_flag={old_flag!r} -> new_flag={spec.flag!r})"
)
try:
await spec.instance.run(child_ctx, old_flag, spec.flag)
except Exception as e:
child_logger.error(
f"Failed: {type(e).__name__}: {e}", exc_info=True,
)
return "failed"
try:
await self._store_flag(config, spec.name, spec.flag)
except Exception as e:
child_logger.warning(
f"Completed but could not persist state flag "
f"({type(e).__name__}: {e}); will re-run next cycle"
)
return "failed-state-write"
child_logger.info(f"Completed (flag={spec.flag!r})")
return "ran"
# ------------------------------------------------------------------
# Main loop.
# ------------------------------------------------------------------
async def run(self):
logger.info(
f"Bootstrapper starting with {len(self.specs)} initialisers"
)
while self.running:
sleep_for = STEADY_INTERVAL
try:
config, flow = await self._open_clients()
except Exception as e:
logger.info(
f"Failed to open clients "
f"({type(e).__name__}: {e}); retry in {GATE_BACKOFF}s"
)
await asyncio.sleep(GATE_BACKOFF)
continue
try:
# Phase 1: pre-service initialisers run unconditionally.
pre_specs = [
s for s in self.specs
if not s.instance.wait_for_services
]
pre_results = {}
for spec in pre_specs:
pre_results[spec.name] = await self._run_spec(
spec, config, flow,
)
# Phase 2: gate.
gate_ok = await self._gate_ready(config, flow)
# Phase 3: post-service initialisers, if gate passed.
post_results = {}
if gate_ok:
post_specs = [
s for s in self.specs
if s.instance.wait_for_services
]
for spec in post_specs:
post_results[spec.name] = await self._run_spec(
spec, config, flow,
)
# Cadence selection.
if not gate_ok:
sleep_for = GATE_BACKOFF
else:
all_results = {**pre_results, **post_results}
if any(r != "skip" for r in all_results.values()):
sleep_for = INIT_RETRY
else:
sleep_for = STEADY_INTERVAL
finally:
await self._safe_stop(config)
await self._safe_stop(flow)
await asyncio.sleep(sleep_for)
# ------------------------------------------------------------------
# CLI arg plumbing.
# ------------------------------------------------------------------
@staticmethod
def add_args(parser: ArgumentParser) -> None:
AsyncProcessor.add_args(parser)
parser.add_argument(
'-c', '--initialisers-file',
help='Path to YAML or JSON file describing the '
'initialisers to run. Ignored when the '
"'initialisers' parameter is provided directly "
'(e.g. when running inside a processor group).',
)
def run():
Processor.launch(default_ident, __doc__)

View file

@ -0,0 +1,20 @@
"""
Core bootstrap initialisers.
These cover the base TrustGraph deployment case. Enterprise or
third-party initialisers live in their own packages and are
referenced in the bootstrapper's config by fully-qualified dotted
path.
"""
from . pulsar_topology import PulsarTopology
from . template_seed import TemplateSeed
from . workspace_init import WorkspaceInit
from . default_flow_start import DefaultFlowStart
__all__ = [
"PulsarTopology",
"TemplateSeed",
"WorkspaceInit",
"DefaultFlowStart",
]

View file

@ -0,0 +1,101 @@
"""
The DefaultFlowStart initialiser starts a named flow in a workspace
using a specified blueprint.
Separated from WorkspaceInit so deployments that want a workspace
without an auto-started flow can simply omit this initialiser.
Parameters
----------
workspace : str (default "default")
Workspace in which to start the flow.
flow_id : str (default "default")
Identifier for the started flow.
blueprint : str (required)
Blueprint name (must already exist in the workspace's config,
typically via TemplateSeed -> WorkspaceInit).
description : str (default "Default")
Human-readable description passed to flow-svc.
parameters : dict (optional)
Optional parameter overrides passed to start-flow.
"""
from trustgraph.schema import FlowRequest
from .. base import Initialiser
class DefaultFlowStart(Initialiser):
def __init__(
self,
workspace="default",
flow_id="default",
blueprint=None,
description="Default",
parameters=None,
**kwargs,
):
super().__init__(**kwargs)
if not blueprint:
raise ValueError(
"DefaultFlowStart requires 'blueprint'"
)
self.workspace = workspace
self.flow_id = flow_id
self.blueprint = blueprint
self.description = description
self.parameters = dict(parameters) if parameters else {}
async def run(self, ctx, old_flag, new_flag):
# Check whether the flow already exists. Belt-and-braces
# beyond the flag gate: if an operator stops and restarts the
# bootstrapper after the flow is already running, we don't
# want to blindly try to start it again.
list_resp = await ctx.flow.request(
FlowRequest(
operation="list-flows",
workspace=self.workspace,
),
timeout=10,
)
if list_resp.error:
raise RuntimeError(
f"list-flows failed: "
f"{list_resp.error.type}: {list_resp.error.message}"
)
if self.flow_id in (list_resp.flow_ids or []):
ctx.logger.info(
f"Flow {self.flow_id!r} already running in workspace "
f"{self.workspace!r}; nothing to do"
)
return
ctx.logger.info(
f"Starting flow {self.flow_id!r} "
f"(blueprint={self.blueprint!r}) "
f"in workspace {self.workspace!r}"
)
resp = await ctx.flow.request(
FlowRequest(
operation="start-flow",
workspace=self.workspace,
flow_id=self.flow_id,
blueprint_name=self.blueprint,
description=self.description,
parameters=self.parameters,
),
timeout=30,
)
if resp.error:
raise RuntimeError(
f"start-flow failed: "
f"{resp.error.type}: {resp.error.message}"
)
ctx.logger.info(
f"Flow {self.flow_id!r} started"
)

View file

@ -0,0 +1,131 @@
"""
The PulsarTopology initialiser creates the Pulsar tenant and namespaces
with their retention policies.
Runs pre-gate (``wait_for_services = False``) because config-svc and
flow-svc can't connect to Pulsar until these namespaces exist.
Admin-API calls are idempotent so re-runs on flag change are safe.
"""
import asyncio
import requests
from .. base import Initialiser
# Namespace configs. flow/request take broker defaults. response
# and notify get aggressive retention — those classes carry short-lived
# request/response and notification traffic only.
NAMESPACE_CONFIG = {
"flow": {},
"request": {},
"response": {
"retention_policies": {
"retentionSizeInMB": -1,
"retentionTimeInMinutes": 3,
"subscriptionExpirationTimeMinutes": 30,
},
},
"notify": {
"retention_policies": {
"retentionSizeInMB": -1,
"retentionTimeInMinutes": 3,
"subscriptionExpirationTimeMinutes": 5,
},
},
}
REQUEST_TIMEOUT = 10
class PulsarTopology(Initialiser):
wait_for_services = False
def __init__(
self,
admin_url="http://pulsar:8080",
tenant="tg",
**kwargs,
):
super().__init__(**kwargs)
self.admin_url = admin_url.rstrip("/")
self.tenant = tenant
async def run(self, ctx, old_flag, new_flag):
# requests is blocking; offload to executor so the loop stays
# responsive.
loop = asyncio.get_event_loop()
await loop.run_in_executor(None, self._reconcile_sync, ctx.logger)
# ------------------------------------------------------------------
# Sync admin-API calls.
# ------------------------------------------------------------------
def _get_clusters(self):
resp = requests.get(
f"{self.admin_url}/admin/v2/clusters",
timeout=REQUEST_TIMEOUT,
)
resp.raise_for_status()
return resp.json()
def _tenant_exists(self):
resp = requests.get(
f"{self.admin_url}/admin/v2/tenants/{self.tenant}",
timeout=REQUEST_TIMEOUT,
)
return resp.status_code == 200
def _create_tenant(self, clusters):
resp = requests.put(
f"{self.admin_url}/admin/v2/tenants/{self.tenant}",
json={"adminRoles": [], "allowedClusters": clusters},
timeout=REQUEST_TIMEOUT,
)
if resp.status_code != 204:
raise RuntimeError(
f"Tenant {self.tenant!r} create failed: "
f"{resp.status_code} {resp.text}"
)
def _namespace_exists(self, namespace):
resp = requests.get(
f"{self.admin_url}/admin/v2/namespaces/"
f"{self.tenant}/{namespace}",
timeout=REQUEST_TIMEOUT,
)
return resp.status_code == 200
def _create_namespace(self, namespace, config):
resp = requests.put(
f"{self.admin_url}/admin/v2/namespaces/"
f"{self.tenant}/{namespace}",
json=config,
timeout=REQUEST_TIMEOUT,
)
if resp.status_code != 204:
raise RuntimeError(
f"Namespace {self.tenant}/{namespace} create failed: "
f"{resp.status_code} {resp.text}"
)
def _reconcile_sync(self, logger):
if not self._tenant_exists():
clusters = self._get_clusters()
logger.info(
f"Creating tenant {self.tenant!r} with clusters {clusters}"
)
self._create_tenant(clusters)
else:
logger.debug(f"Tenant {self.tenant!r} already exists")
for namespace, config in NAMESPACE_CONFIG.items():
if self._namespace_exists(namespace):
logger.debug(
f"Namespace {self.tenant}/{namespace} already exists"
)
continue
logger.info(
f"Creating namespace {self.tenant}/{namespace}"
)
self._create_namespace(namespace, config)

View file

@ -0,0 +1,93 @@
"""
The TemplateSeed initialiser populates the reserved ``__template__``
workspace from an external JSON seed file.
Seed file shape:
.. code-block:: json
{
"flow-blueprint": {
"ontology": { ... },
"agent": { ... }
},
"prompt": {
...
},
...
}
Top-level keys are config types; nested keys are config entries.
Values are arbitrary JSON (they'll be ``json.dumps()``'d on write).
Parameters
----------
config_file : str
Path to the seed file on disk.
overwrite : bool (default False)
On re-run (flag change), if True overwrite all keys; if False
upsert-missing-only (preserves any operator customisation of
the template).
"""
import json
from .. base import Initialiser
TEMPLATE_WORKSPACE = "__template__"
class TemplateSeed(Initialiser):
def __init__(self, config_file, overwrite=False, **kwargs):
super().__init__(**kwargs)
if not config_file:
raise ValueError("TemplateSeed requires 'config_file'")
self.config_file = config_file
self.overwrite = overwrite
async def run(self, ctx, old_flag, new_flag):
with open(self.config_file) as f:
seed = json.load(f)
if old_flag is None:
# Clean first run — write every entry.
await self._write_all(ctx, seed)
return
# Re-run after flag change.
if self.overwrite:
await self._write_all(ctx, seed)
else:
await self._upsert_missing(ctx, seed)
async def _write_all(self, ctx, seed):
values = []
for type_name, entries in seed.items():
for key, value in entries.items():
values.append((type_name, key, json.dumps(value)))
if values:
await ctx.config.put_many(TEMPLATE_WORKSPACE, values)
ctx.logger.info(
f"Template seeded with {len(values)} entries"
)
async def _upsert_missing(self, ctx, seed):
written = 0
for type_name, entries in seed.items():
existing = set(
await ctx.config.keys(TEMPLATE_WORKSPACE, type_name)
)
values = []
for key, value in entries.items():
if key not in existing:
values.append(
(type_name, key, json.dumps(value))
)
if values:
await ctx.config.put_many(TEMPLATE_WORKSPACE, values)
written += len(values)
ctx.logger.info(
f"Template upsert-missing: {written} new entries"
)

View file

@ -0,0 +1,138 @@
"""
The WorkspaceInit initialiser creates a workspace and populates it from
either the ``__template__`` workspace or a seed file on disk.
Parameters
----------
workspace : str
Target workspace to create / populate.
source : str
Either ``"template"`` (copy the full contents of the
``__template__`` workspace) or ``"seed-file"`` (read from
``seed_file``).
seed_file : str (required when source=="seed-file")
Path to a JSON seed file with the same shape TemplateSeed consumes.
overwrite : bool (default False)
On re-run (flag change), if True overwrite all keys; if False,
upsert-missing-only (preserves in-workspace customisations).
Raises (in ``run``)
-------------------
When source is ``"template"``, raises ``RuntimeError`` if the
``__template__`` workspace is empty, indicating that TemplateSeed
hasn't run yet. The bootstrapper's retry loop will re-attempt on
the next cycle once the prerequisite is satisfied.
"""
import json
from .. base import Initialiser
TEMPLATE_WORKSPACE = "__template__"
class WorkspaceInit(Initialiser):
def __init__(
self,
workspace="default",
source="template",
seed_file=None,
overwrite=False,
**kwargs,
):
super().__init__(**kwargs)
if source not in ("template", "seed-file"):
raise ValueError(
f"WorkspaceInit: source must be 'template' or "
f"'seed-file', got {source!r}"
)
if source == "seed-file" and not seed_file:
raise ValueError(
"WorkspaceInit: seed_file required when source='seed-file'"
)
self.workspace = workspace
self.source = source
self.seed_file = seed_file
self.overwrite = overwrite
async def run(self, ctx, old_flag, new_flag):
if self.source == "seed-file":
tree = self._load_seed_file()
else:
tree = await self._load_from_template(ctx)
if old_flag is None or self.overwrite:
await self._write_all(ctx, tree)
else:
await self._upsert_missing(ctx, tree)
def _load_seed_file(self):
with open(self.seed_file) as f:
return json.load(f)
async def _load_from_template(self, ctx):
"""Build a seed tree from the entire ``__template__`` workspace.
Raises if the workspace is empty, so the bootstrapper knows
the prerequisite isn't met yet."""
raw_tree = await ctx.config.get_all(TEMPLATE_WORKSPACE)
tree = {}
total = 0
for type_name, entries in raw_tree.items():
parsed = {}
for key, raw in entries.items():
if raw is None:
continue
try:
parsed[key] = json.loads(raw)
except Exception:
parsed[key] = raw
total += 1
if parsed:
tree[type_name] = parsed
if total == 0:
raise RuntimeError(
"Template workspace is empty — has TemplateSeed run yet?"
)
ctx.logger.debug(
f"Loaded {total} template entries across {len(tree)} types"
)
return tree
async def _write_all(self, ctx, tree):
values = []
for type_name, entries in tree.items():
for key, value in entries.items():
values.append((type_name, key, json.dumps(value)))
if values:
await ctx.config.put_many(self.workspace, values)
ctx.logger.info(
f"Workspace {self.workspace!r} populated with "
f"{len(values)} entries"
)
async def _upsert_missing(self, ctx, tree):
written = 0
for type_name, entries in tree.items():
existing = set(
await ctx.config.keys(self.workspace, type_name)
)
values = []
for key, value in entries.items():
if key not in existing:
values.append(
(type_name, key, json.dumps(value))
)
if values:
await ctx.config.put_many(self.workspace, values)
written += len(values)
ctx.logger.info(
f"Workspace {self.workspace!r} upsert-missing: "
f"{written} new entries"
)
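Taken together, the core initialisers are typically wired in the order PulsarTopology, TemplateSeed, WorkspaceInit, DefaultFlowStart, since each later step consumes what the previous one produced. A hedged sketch of the list form the bootstrapper accepts via its ``initialisers`` parameter, equivalent to the YAML file shape shown earlier; flag values, the seed path, and the blueprint name are illustrative:

initialisers = [
    {
        "class": "trustgraph.bootstrap.initialisers.PulsarTopology",
        "name": "pulsar-topology",
        "flag": "v1",
        "params": {"admin_url": "http://pulsar:8080", "tenant": "tg"},
    },
    {
        "class": "trustgraph.bootstrap.initialisers.TemplateSeed",
        "name": "template-seed",
        "flag": "v1",
        "params": {"config_file": "/config/template-seed.json"},
    },
    {
        "class": "trustgraph.bootstrap.initialisers.WorkspaceInit",
        "name": "default-workspace",
        "flag": "v1",
        "params": {"workspace": "default", "source": "template"},
    },
    {
        "class": "trustgraph.bootstrap.initialisers.DefaultFlowStart",
        "name": "default-flow",
        "flag": "v1",
        "params": {
            "workspace": "default",
            "flow_id": "default",
            "blueprint": "ontology",    # illustrative blueprint name
        },
    },
]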

View file

@ -24,6 +24,21 @@ logger = logging.getLogger(__name__)
default_ident = "config-svc"
def is_reserved_workspace(workspace):
"""Reserved workspaces are storage-only.
Any workspace id beginning with ``_`` is reserved for internal use
(e.g. ``__template__`` holding factory-default seed config).
Reads and writes work normally so bootstrap and provisioning code
can use the standard config API, but **change notifications for
reserved workspaces are suppressed**. Services subscribed to the
config push therefore never see reserved-workspace events and
cannot accidentally act on template content as if it were live
state.
"""
return workspace.startswith("_")
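# Quick illustration of the rule above (workspace ids are illustrative):
#   is_reserved_workspace("__system__")    -> True
#   is_reserved_workspace("__template__")  -> True
#   is_reserved_workspace("default")       -> False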
default_config_request_queue = config_request_queue
default_config_response_queue = config_response_queue
default_config_push_queue = config_push_queue
@ -130,6 +145,21 @@ class Processor(AsyncProcessor):
async def push(self, changes=None):
# Suppress notifications from reserved workspaces (ids starting
# with "_", e.g. "__template__"). Stored config is preserved;
# only the broadcast is filtered. Keeps services oblivious to
# template / bootstrap state.
if changes:
filtered = {}
for type_name, workspaces in changes.items():
visible = [
w for w in workspaces
if not is_reserved_workspace(w)
]
if visible:
filtered[type_name] = visible
changes = filtered
version = await self.config.get_version()
resp = ConfigPush(

View file

@ -5,7 +5,7 @@ Input is text, output is embeddings vector.
"""
from ... base import EmbeddingsService
from ollama import Client
from ollama import AsyncClient
import os
import logging
@ -30,24 +30,24 @@ class Processor(EmbeddingsService):
}
)
self.client = Client(host=ollama)
self.client = AsyncClient(host=ollama)
self.default_model = model
self._checked_models = set()
def _ensure_model(self, model_name):
async def _ensure_model(self, model_name):
"""Check if model exists locally, pull it if not."""
if model_name in self._checked_models:
return
try:
self.client.show(model_name)
await self.client.show(model_name)
self._checked_models.add(model_name)
except Exception as e:
status_code = getattr(e, 'status_code', None)
if status_code == 404 or "not found" in str(e).lower():
logger.info(f"Ollama model '{model_name}' not found locally. Pulling, this may take a while...")
try:
self.client.pull(model_name)
await self.client.pull(model_name)
self._checked_models.add(model_name)
logger.info(f"Successfully pulled Ollama model '{model_name}'.")
except Exception as pull_e:
@ -63,10 +63,10 @@ class Processor(EmbeddingsService):
use_model = model or self.default_model
# Ensure the model exists/is pulled
self._ensure_model(use_model)
await self._ensure_model(use_model)
# Ollama handles batch input efficiently
embeds = self.client.embed(
embeds = await self.client.embed(
model = use_model,
input = texts
)

View file

@ -1,22 +1,371 @@
"""
IAM-backed authentication and authorisation for the API gateway.
class Authenticator:
The gateway delegates both authentication ("who is this caller?")
and authorisation ("may they do this?") to the IAM regime via the
contract specified in docs/tech-specs/iam-contract.md. No regime-
specific policy (roles, scopes, claims) lives in the gateway.
def __init__(self, token=None, allow_all=False):
- Authentication: API keys are resolved by IAM; JWTs are validated
locally against the cached signing public key.
- Authorisation: every per-request decision is asked of IAM via
``authorise(identity, capability, resource, parameters)``, with
results cached for the TTL the regime returns.
"""
if not allow_all and token is None:
raise RuntimeError("Need a token")
import asyncio
import base64
import hashlib
import json
import logging
import time
import uuid
from dataclasses import dataclass, field
if not allow_all and token == "":
raise RuntimeError("Need a token")
from aiohttp import web
self.token = token
self.allow_all = allow_all
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import ed25519
def permitted(self, token, roles):
from ..base.iam_client import IamClient
from ..base.metrics import ProducerMetrics, SubscriberMetrics
from ..schema import (
IamRequest, IamResponse,
iam_request_queue, iam_response_queue,
)
if self.allow_all: return True
logger = logging.getLogger("auth")
if self.token != token: return False
API_KEY_CACHE_TTL = 60 # seconds
return True
# Upper bound on cache TTL the gateway honours for an authorisation
# decision, regardless of what the regime suggested. Caps the
# revocation latency window.
AUTHZ_CACHE_TTL_MAX = 60 # seconds
@dataclass
class Identity:
"""The gateway-side surface of an authenticated caller.
Per the IAM contract this is a small fixed shape; regime-internal
state (roles, claims, group memberships) is reachable only via
the regime's ``authorise`` operation. The gateway itself never
reads policy from this object.
"""
# Opaque handle, quoted back when calling ``authorise``. For
# the OSS regime this is the user record's id; the gateway
# treats it as a string with no semantic content.
handle: str
# The workspace this credential authenticates to. Used by the
# gateway as the default-fill-in for operations that omit a
# workspace. Never used as policy input.
workspace: str
# Stable identifier for audit logs. In OSS this is the same
# value as ``handle``; not assumed equal in the contract.
principal_id: str
# How the credential was presented. Non-policy; useful for
# logs / metrics only.
source: str # "api-key" | "jwt"
def _auth_failure():
return web.HTTPUnauthorized(
text='{"error":"auth failure"}',
content_type="application/json",
)
def _access_denied():
return web.HTTPForbidden(
text='{"error":"access denied"}',
content_type="application/json",
)
def _b64url_decode(s):
pad = "=" * (-len(s) % 4)
return base64.urlsafe_b64decode(s + pad)
def _verify_jwt_eddsa(token, public_pem):
"""Verify an Ed25519 JWT and return its claims. Raises on any
validation failure. Refuses non-EdDSA algorithms."""
parts = token.split(".")
if len(parts) != 3:
raise ValueError("malformed JWT")
h_b64, p_b64, s_b64 = parts
signing_input = f"{h_b64}.{p_b64}".encode("ascii")
header = json.loads(_b64url_decode(h_b64))
if header.get("alg") != "EdDSA":
raise ValueError(f"unsupported alg: {header.get('alg')!r}")
key = serialization.load_pem_public_key(public_pem.encode("ascii"))
if not isinstance(key, ed25519.Ed25519PublicKey):
raise ValueError("public key is not Ed25519")
signature = _b64url_decode(s_b64)
key.verify(signature, signing_input) # raises InvalidSignature
claims = json.loads(_b64url_decode(p_b64))
exp = claims.get("exp")
if exp is None or exp < time.time():
raise ValueError("expired")
return claims
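
# --- Editor's sketch, not from this changeset: minting a token that
# _verify_jwt_eddsa() above accepts, e.g. for unit tests. The claim names
# ("sub", "workspace", "exp") follow the gateway code; the subject and
# workspace values are illustrative.
import base64, json, time
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import ed25519

def _b64url(b: bytes) -> str:
    return base64.urlsafe_b64encode(b).rstrip(b"=").decode("ascii")

def mint_test_jwt(private_key: ed25519.Ed25519PrivateKey) -> str:
    header = {"alg": "EdDSA", "typ": "JWT"}
    claims = {"sub": "alice", "workspace": "default",
              "exp": int(time.time()) + 3600}
    signing_input = (
        _b64url(json.dumps(header).encode()) + "." +
        _b64url(json.dumps(claims).encode())
    )
    sig = private_key.sign(signing_input.encode("ascii"))
    return signing_input + "." + _b64url(sig)

# key = ed25519.Ed25519PrivateKey.generate()
# pem = key.public_key().public_bytes(
#     serialization.Encoding.PEM,
#     serialization.PublicFormat.SubjectPublicKeyInfo,
# ).decode("ascii")
# claims = _verify_jwt_eddsa(mint_test_jwt(key), pem)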
class IamAuth:
"""Resolves bearer credentials via the IAM service.
Used by every gateway endpoint that needs authentication. Fetches
the IAM signing public key at startup (cached in memory). API
keys are resolved via the IAM service with a local hash-to-identity
cache (short TTL so revoked keys stop working within the TTL
window without any push mechanism)."""
def __init__(self, backend, id="api-gateway"):
self.backend = backend
self.id = id
# Populated at start() via IAM.
self._signing_public_pem = None
# API-key cache: plaintext_sha256_hex -> (Identity, expires_ts)
self._key_cache = {}
self._key_cache_lock = asyncio.Lock()
# Authorisation decision cache: hash(handle, capability,
# resource, parameters) -> (allow_bool, expires_ts). Holds
# both allows and denies — denies cached briefly to avoid
# hammering iam-svc with repeated rejected attempts.
self._authz_cache: dict[str, tuple[bool, float]] = {}
self._authz_cache_lock = asyncio.Lock()
# ------------------------------------------------------------------
# Short-lived client helper. Mirrors the pattern used by the
# bootstrap framework and AsyncProcessor: a fresh uuid suffix per
# invocation so Pulsar exclusive subscriptions don't collide with
# ghosts from prior calls.
# ------------------------------------------------------------------
def _make_client(self):
rr_id = str(uuid.uuid4())
return IamClient(
backend=self.backend,
subscription=f"{self.id}--iam--{rr_id}",
consumer_name=self.id,
request_topic=iam_request_queue,
request_schema=IamRequest,
request_metrics=ProducerMetrics(
processor=self.id, flow=None, name="iam-request",
),
response_topic=iam_response_queue,
response_schema=IamResponse,
response_metrics=SubscriberMetrics(
processor=self.id, flow=None, name="iam-response",
),
)
async def _with_client(self, op):
"""Open a short-lived IamClient, run ``op(client)``, close."""
client = self._make_client()
await client.start()
try:
return await op(client)
finally:
try:
await client.stop()
except Exception:
pass
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
async def start(self, max_retries=30, retry_delay=2.0):
"""Fetch the signing public key from IAM. Retries on
failure the gateway may be starting before IAM is ready."""
async def _fetch(client):
return await client.get_signing_key_public()
for attempt in range(max_retries):
try:
pem = await self._with_client(_fetch)
if pem:
self._signing_public_pem = pem
logger.info(
"IamAuth: fetched IAM signing public key "
f"({len(pem)} bytes)"
)
return
except Exception as e:
logger.info(
f"IamAuth: waiting for IAM signing key "
f"({type(e).__name__}: {e}); "
f"retry {attempt + 1}/{max_retries}"
)
await asyncio.sleep(retry_delay)
# Don't prevent startup forever. A later authenticate() call
# will try again via the JWT path.
logger.warning(
"IamAuth: could not fetch IAM signing key at startup; "
"JWT validation will fail until it's available"
)
# ------------------------------------------------------------------
# Authentication
# ------------------------------------------------------------------
async def authenticate(self, request):
"""Extract and validate the Bearer credential from an HTTP
request. Returns an ``Identity``. Raises HTTPUnauthorized
(401 / "auth failure") on any failure mode the caller
cannot distinguish missing / malformed / invalid / expired /
revoked credentials."""
header = request.headers.get("Authorization", "")
if not header.startswith("Bearer "):
raise _auth_failure()
token = header[len("Bearer "):].strip()
if not token:
raise _auth_failure()
# API keys always start with "tg_". JWTs have two dots and
# no "tg_" prefix. Discriminate cheaply.
if token.startswith("tg_"):
return await self._resolve_api_key(token)
if token.count(".") == 2:
return self._verify_jwt(token)
raise _auth_failure()
def _verify_jwt(self, token):
if not self._signing_public_pem:
raise _auth_failure()
try:
claims = _verify_jwt_eddsa(token, self._signing_public_pem)
except Exception as e:
logger.debug(f"JWT validation failed: {type(e).__name__}: {e}")
raise _auth_failure()
sub = claims.get("sub", "")
ws = claims.get("workspace", "")
if not sub or not ws:
raise _auth_failure()
# JWT carries no policy state under the IAM contract;
# any roles / claims field is ignored here.
return Identity(
handle=sub, workspace=ws, principal_id=sub, source="jwt",
)
async def _resolve_api_key(self, plaintext):
h = hashlib.sha256(plaintext.encode("utf-8")).hexdigest()
cached = self._key_cache.get(h)
now = time.time()
if cached and cached[1] > now:
return cached[0]
async with self._key_cache_lock:
cached = self._key_cache.get(h)
if cached and cached[1] > now:
return cached[0]
try:
async def _call(client):
return await client.resolve_api_key(plaintext)
# ``roles`` is returned by the OSS regime as a hint
# but is not consulted by the gateway; all policy
# decisions go through ``authorise``.
user_id, workspace, _roles = await self._with_client(_call)
except Exception as e:
logger.debug(
f"API key resolution failed: "
f"{type(e).__name__}: {e}"
)
raise _auth_failure()
if not user_id or not workspace:
raise _auth_failure()
identity = Identity(
handle=user_id, workspace=workspace,
principal_id=user_id, source="api-key",
)
self._key_cache[h] = (identity, now + API_KEY_CACHE_TTL)
return identity
# ------------------------------------------------------------------
# Authorisation
# ------------------------------------------------------------------
@staticmethod
def _authz_cache_key(handle, capability, resource, parameters):
payload = json.dumps(
{
"h": handle,
"c": capability,
"r": resource or {},
"p": parameters or {},
},
sort_keys=True,
separators=(",", ":"),
)
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
async def authorise(self, identity, capability, resource, parameters):
"""Ask the IAM regime whether ``identity`` may perform
``capability`` on ``resource`` given ``parameters``.
Caches the decision for the regime's suggested TTL, clamped
above by ``AUTHZ_CACHE_TTL_MAX``. Both allow and deny
decisions are cached (denies briefly, to avoid hammering
iam-svc with repeated rejected attempts).
Raises ``HTTPForbidden`` (403 / "access denied") on a deny
decision. Raises ``HTTPUnauthorized`` (401 / "auth failure")
if the IAM service errors out; the gateway fails closed."""
key = self._authz_cache_key(
identity.handle, capability, resource, parameters,
)
now = time.time()
cached = self._authz_cache.get(key)
if cached and cached[1] > now:
allow, _ = cached
if not allow:
raise _access_denied()
return
async with self._authz_cache_lock:
cached = self._authz_cache.get(key)
if cached and cached[1] > now:
allow, _ = cached
if not allow:
raise _access_denied()
return
try:
async def _call(client):
return await client.authorise(
identity.handle, capability,
resource or {}, parameters or {},
)
allow, ttl = await self._with_client(_call)
except Exception as e:
logger.warning(
f"authorise failed: {type(e).__name__}: {e}; "
f"failing closed for "
f"{identity.principal_id!r} cap={capability!r}"
)
raise _auth_failure()
ttl = max(0, min(int(ttl or 0), AUTHZ_CACHE_TTL_MAX))
self._authz_cache[key] = (bool(allow), now + ttl)
if not allow:
raise _access_denied()
return
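
# --- Editor's sketch, not from this changeset: the decision cache key is
# deterministic because the JSON is canonicalised with sort_keys, so the
# ordering of resource / parameter dicts does not produce distinct cache
# entries. Handle and capability values below are illustrative.
k1 = IamAuth._authz_cache_key(
    "user-1", "documents:read",
    {"workspace": "default", "flow": "f1"}, {},
)
k2 = IamAuth._authz_cache_key(
    "user-1", "documents:read",
    {"flow": "f1", "workspace": "default"}, {},
)
assert k1 == k2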

View file

@ -0,0 +1,100 @@
"""
Gateway-side authorisation entry points.
Under the IAM contract (see docs/tech-specs/iam-contract.md) the
gateway holds *no* policy state. Roles, capability sets, and
workspace-scope rules all live in the IAM regime (iam-svc for OSS).
This module is the thin surface the gateway uses to ask the regime
for a decision:
- ``PUBLIC`` / ``AUTHENTICATED`` sentinels for endpoints that don't
go through capability-based authorisation.
- :func:`enforce`: authenticate the caller, then ask the regime.
- :func:`enforce_workspace`: default-fill the workspace from the
caller's bound workspace and ask the regime, with the workspace
treated as the resource address.
The capability strings themselves are an open vocabulary; see
docs/tech-specs/capabilities.md. The gateway does not validate them
beyond passing them through; an unknown capability simply produces a
deny verdict from the regime.
"""
from aiohttp import web
PUBLIC = "__public__"
AUTHENTICATED = "__authenticated__"
def access_denied():
return web.HTTPForbidden(
text='{"error":"access denied"}',
content_type="application/json",
)
def auth_failure():
return web.HTTPUnauthorized(
text='{"error":"auth failure"}',
content_type="application/json",
)
async def enforce(request, auth, capability):
"""Authenticate the caller and (for non-sentinel capabilities)
ask the IAM regime whether they may invoke ``capability``.
The resource is system-level (``{}``) and parameters are empty
use :func:`enforce_workspace` for workspace-scoped endpoints, or
drive authorisation through the operation registry for richer
cases.
- ``PUBLIC``: returns ``None``; no authentication.
- ``AUTHENTICATED``: returns the ``Identity``; no authorisation.
- capability string: returns the ``Identity`` if the regime
allows; raises ``HTTPForbidden`` otherwise.
"""
if capability == PUBLIC:
return None
identity = await auth.authenticate(request)
if capability == AUTHENTICATED:
return identity
await auth.authorise(identity, capability, {}, {})
return identity
async def enforce_workspace(data, identity, auth, capability=None):
"""Default-fill the workspace on a request body and (optionally)
authorise the caller for ``capability`` against that workspace.
- Target workspace = ``data["workspace"]`` if supplied, else the
caller's bound workspace.
- On success, ``data["workspace"]`` is overwritten with the
resolved value so downstream code sees a single canonical
address.
- When ``capability`` is given, the regime is asked whether the
caller may invoke ``capability`` on ``{workspace: target}``.
Raises ``HTTPForbidden`` on a deny.
For ``capability=None`` no authorisation call is made; the
caller has presumably already authorised via :func:`enforce`
(handy for endpoints that authorise once then resolve workspace
on the body before forwarding).
"""
if not isinstance(data, dict):
return data
requested = data.get("workspace", "")
target = requested or identity.workspace
data["workspace"] = target
if capability is not None:
await auth.authorise(
identity, capability, {"workspace": target}, {},
)
return data
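
# --- Editor's sketch, not from this changeset: how an endpoint typically
# combines the two helpers. The handler and the "documents:read"
# capability string are illustrative, not gateway code.
from aiohttp import web

async def handle_list_documents(request, auth):
    # 401 on bad credentials, 403 if the regime denies the capability.
    identity = await enforce(request, auth, "documents:read")
    body = await request.json()
    # Default-fill the caller's bound workspace if the body omitted one,
    # then re-check the capability against that workspace.
    body = await enforce_workspace(body, identity, auth,
                                   capability="documents:read")
    return web.json_response({"workspace": body["workspace"]})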

View file

@ -0,0 +1,40 @@
from ... schema import IamRequest, IamResponse
from ... schema import iam_request_queue, iam_response_queue
from ... messaging import TranslatorRegistry
from . requestor import ServiceRequestor
class IamRequestor(ServiceRequestor):
def __init__(self, backend, consumer, subscriber, timeout=120,
request_queue=None, response_queue=None):
if request_queue is None:
request_queue = iam_request_queue
if response_queue is None:
response_queue = iam_response_queue
super().__init__(
backend=backend,
consumer_name=consumer,
subscription=subscriber,
request_queue=request_queue,
response_queue=response_queue,
request_schema=IamRequest,
response_schema=IamResponse,
timeout=timeout,
)
self.request_translator = (
TranslatorRegistry.get_request_translator("iam")
)
self.response_translator = (
TranslatorRegistry.get_response_translator("iam")
)
def to_request(self, body):
return self.request_translator.decode(body)
def from_response(self, message):
return self.response_translator.encode_with_completion(message)

View file

@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
from . config import ConfigRequestor
from . flow import FlowRequestor
from . iam import IamRequestor
from . librarian import LibrarianRequestor
from . knowledge import KnowledgeRequestor
from . collection_management import CollectionManagementRequestor
@ -72,6 +73,7 @@ request_response_dispatchers = {
global_dispatchers = {
"config": ConfigRequestor,
"flow": FlowRequestor,
"iam": IamRequestor,
"librarian": LibrarianRequestor,
"knowledge": KnowledgeRequestor,
"collection-management": CollectionManagementRequestor,
@ -105,13 +107,31 @@ class DispatcherWrapper:
class DispatcherManager:
def __init__(self, backend, config_receiver, prefix="api-gateway",
queue_overrides=None):
def __init__(self, backend, config_receiver, auth,
prefix="api-gateway", queue_overrides=None):
"""
``auth`` is required. It flows into the Mux for first-frame
WebSocket authentication and into downstream dispatcher
construction. There is no permissive default; constructing
a DispatcherManager without an authenticator would be a
silent downgrade to no-auth on the socket path.
"""
if auth is None:
raise ValueError(
"DispatcherManager requires an 'auth' argument — there "
"is no no-auth mode"
)
self.backend = backend
self.config_receiver = config_receiver
self.config_receiver.add_handler(self)
self.prefix = prefix
# Gateway IamAuth — used by the socket Mux for first-frame
# auth and by any dispatcher that needs to resolve caller
# identity out-of-band.
self.auth = auth
# Store queue overrides for global services
# Format: {"config": {"request": "...", "response": "..."}, ...}
self.queue_overrides = queue_overrides or {}
@ -163,6 +183,15 @@ class DispatcherManager:
def dispatch_global_service(self):
return DispatcherWrapper(self.process_global_service)
def dispatch_auth_iam(self):
"""Pre-configured IAM dispatcher for the gateway's auth
endpoints (login, bootstrap, change-password). Pins the
kind to ``iam`` so these handlers don't have to supply URL
params the global dispatcher would expect."""
async def _process(data, responder):
return await self.invoke_global_service(data, responder, "iam")
return DispatcherWrapper(_process)
def dispatch_core_export(self):
return DispatcherWrapper(self.process_core_export)
@ -314,7 +343,10 @@ class DispatcherManager:
async def process_socket(self, ws, running, params):
dispatcher = Mux(self, ws, running)
# The mux self-authenticates via the first-frame protocol;
# pass the gateway's IamAuth so it can validate tokens
# without reaching back into the endpoint layer.
dispatcher = Mux(self, ws, running, auth=self.auth)
return dispatcher

View file

@ -16,11 +16,28 @@ MAX_QUEUE_SIZE = 10
class Mux:
def __init__(self, dispatcher_manager, ws, running):
def __init__(self, dispatcher_manager, ws, running, auth):
"""
``auth`` is required; the Mux implements the first-frame
auth protocol described in ``iam.md`` and will refuse any
non-auth frame until an ``auth-ok`` has been issued. There
is no no-auth mode.
"""
if auth is None:
raise ValueError(
"Mux requires an 'auth' argument — there is no "
"no-auth mode"
)
self.dispatcher_manager = dispatcher_manager
self.ws = ws
self.running = running
self.auth = auth
# Authenticated identity, populated by the first-frame auth
# protocol. ``None`` means the socket is not yet
# authenticated; any non-auth frame is refused.
self.identity = None
self.q = asyncio.Queue(maxsize=MAX_QUEUE_SIZE)
@ -31,6 +48,41 @@ class Mux:
if self.ws:
await self.ws.close()
async def _handle_auth_frame(self, data):
"""Process a ``{"type": "auth", "token": "..."}`` frame.
On success, updates ``self.identity`` and returns an
``auth-ok`` response frame. On failure, returns the masked
auth-failure frame. Never raises; auth failures keep the
socket open so the client can retry without reconnecting
(important for browsers, which treat a handshake-time 401
as terminal)."""
token = data.get("token", "")
if not token:
await self.ws.send_json({
"type": "auth-failed",
"error": "auth failure",
})
return
class _Shim:
def __init__(self, tok):
self.headers = {"Authorization": f"Bearer {tok}"}
try:
identity = await self.auth.authenticate(_Shim(token))
except Exception:
await self.ws.send_json({
"type": "auth-failed",
"error": "auth failure",
})
return
self.identity = identity
await self.ws.send_json({
"type": "auth-ok",
"workspace": identity.workspace,
})
async def receive(self, msg):
request_id = None
@ -38,6 +90,16 @@ class Mux:
try:
data = msg.json()
# In-band auth protocol: the client sends
# ``{"type": "auth", "token": "..."}`` as its first frame
# (and any time it wants to re-auth: JWT refresh, token
# rotation, etc). Auth is always required on a Mux —
# there is no no-auth mode.
if isinstance(data, dict) and data.get("type") == "auth":
await self._handle_auth_frame(data)
return
request_id = data.get("id")
if "request" not in data:
@ -46,9 +108,125 @@ class Mux:
if "id" not in data:
raise RuntimeError("Bad message")
# Reject all non-auth frames until an ``auth-ok`` has
# been issued.
if self.identity is None:
await self.ws.send_json({
"id": request_id,
"error": {
"message": "auth failure",
"type": "auth-required",
},
"complete": True,
})
return
# Per-service capability gating. Resolved through the
# operation registry so the WS path matches what HTTP
# callers see — same authority, same caps.
#
# Lookup mirrors the HTTP routing decision in
# ``request_task``: presence of ``flow`` on the envelope
# means a flow-level data-plane service (graph-rag,
# agent, …); absence means a workspace-level service
# (config, flow management, librarian, …) whose specific
# operation is in the inner request body. ``iam`` is
# treated as workspace-level too — its operations are
# registered with bare names, no kind prefix.
from ..registry import lookup as _registry_lookup
from ..capabilities import enforce_workspace
from aiohttp import web as _web
service = data.get("service", "")
inner = data.get("request") or {}
inner_op = inner.get("operation", "") if isinstance(inner, dict) else ""
if data.get("flow"):
op = _registry_lookup(f"flow-service:{service}")
elif service == "iam":
op = _registry_lookup(inner_op) if inner_op else None
else:
op = _registry_lookup(f"{service}:{inner_op}") if inner_op else None
if op is None:
await self.ws.send_json({
"id": request_id,
"error": {
"message": "unknown service",
"type": "unknown-service",
},
"complete": True,
})
return
# Resolve workspace first (default-fill from the caller's
# bound workspace), then ask the regime to authorise the
# service-level capability against the matched
# operation's resource shape.
try:
await enforce_workspace(data, self.identity, self.auth)
if isinstance(inner, dict):
await enforce_workspace(inner, self.identity, self.auth)
if data.get("flow"):
resource = {
"workspace": data.get("workspace", ""),
"flow": data.get("flow", ""),
}
parameters = {}
else:
# Build a minimal RequestContext so the matched
# operation's own extractors decide resource and
# parameters — same path the HTTP endpoints take.
from ..registry import RequestContext
ctx = RequestContext(
body=inner if isinstance(inner, dict) else {},
match_info={},
identity=self.identity,
)
resource = op.extract_resource(ctx)
parameters = op.extract_parameters(ctx)
await self.auth.authorise(
self.identity, op.capability, resource, parameters,
)
except _web.HTTPForbidden:
await self.ws.send_json({
"id": request_id,
"error": {
"message": "access denied",
"type": "access-denied",
},
"complete": True,
})
return
except _web.HTTPUnauthorized:
await self.ws.send_json({
"id": request_id,
"error": {
"message": "auth failure",
"type": "auth-required",
},
"complete": True,
})
return
workspace = data["workspace"]
# Plumb authenticated caller's handle as ``actor`` so
# iam-svc handlers (whoami, future actor-scoped checks)
# know who is calling. Overwrite any caller-supplied
# value so it can't be spoofed over the WS.
if (
service == "iam"
and isinstance(data.get("request"), dict)
and self.identity is not None
):
data["request"]["actor"] = self.identity.handle
await self.q.put((
data["id"],
data.get("workspace", "default"),
workspace,
data.get("flow"),
data["service"],
data["request"]

View file

@ -0,0 +1,131 @@
"""
Gateway auth endpoints.
Four dedicated paths:

POST /api/v1/auth/login              unauthenticated; username/password in, JWT out
POST /api/v1/auth/bootstrap          unauthenticated; IAM bootstrap op
POST /api/v1/auth/bootstrap-status   unauthenticated; side-effect-free first-run probe
POST /api/v1/auth/change-password    authenticated; any role
These are the only IAM-surface operations that can be reached from
outside. Everything else routes through ``/api/v1/iam``, gated per
operation by the registry.
"""
import logging
from aiohttp import web
from .. capabilities import enforce, PUBLIC, AUTHENTICATED
logger = logging.getLogger("auth-endpoints")
logger.setLevel(logging.INFO)
class AuthEndpoints:
"""Groups the three auth-surface handlers. Each forwards to the
IAM service via the existing ``IamRequestor`` dispatcher."""
def __init__(self, iam_dispatcher, auth):
self.iam = iam_dispatcher
self.auth = auth
async def start(self):
pass
def add_routes(self, app):
app.add_routes([
web.post("/api/v1/auth/login", self.login),
web.post("/api/v1/auth/bootstrap", self.bootstrap),
web.post(
"/api/v1/auth/bootstrap-status",
self.bootstrap_status,
),
web.post(
"/api/v1/auth/change-password",
self.change_password,
),
])
async def _forward(self, body):
async def responder(x, fin):
pass
return await self.iam.process(body, responder)
async def login(self, request):
"""Public. Accepts {username, password, workspace?}. Returns
{jwt, jwt_expires} on success; IAM's masked auth failure on
anything else."""
await enforce(request, self.auth, PUBLIC)
try:
body = await request.json()
except Exception:
return web.json_response(
{"error": "invalid json"}, status=400,
)
req = {
"operation": "login",
"username": body.get("username", ""),
"password": body.get("password", ""),
"workspace": body.get("workspace", ""),
}
resp = await self._forward(req)
if "error" in resp:
return web.json_response(
{"error": "auth failure"}, status=401,
)
return web.json_response(resp)
async def bootstrap(self, request):
"""Public. Valid only when IAM is running in bootstrap mode
with empty tables. In every other case the IAM service
returns a masked auth-failure."""
await enforce(request, self.auth, PUBLIC)
resp = await self._forward({"operation": "bootstrap"})
if "error" in resp:
return web.json_response(
{"error": "auth failure"}, status=401,
)
return web.json_response(resp)
async def bootstrap_status(self, request):
"""Public, side-effect-free. Returns ``{"bootstrap_available":
bool}`` so a UI can decide whether to render first-run setup
without invoking the consuming ``bootstrap`` op."""
await enforce(request, self.auth, PUBLIC)
resp = await self._forward({"operation": "bootstrap-status"})
if "error" in resp:
return web.json_response(
{"error": "auth failure"}, status=401,
)
return web.json_response(resp)
async def change_password(self, request):
"""Authenticated (any role). Accepts {current_password,
new_password}; user_id is taken from the authenticated
identity, so the caller cannot change someone else's password
this way (reset-password is the admin path)."""
identity = await enforce(request, self.auth, AUTHENTICATED)
try:
body = await request.json()
except Exception:
return web.json_response(
{"error": "invalid json"}, status=400,
)
req = {
"operation": "change-password",
"user_id": identity.handle,
"password": body.get("current_password", ""),
"new_password": body.get("new_password", ""),
}
resp = await self._forward(req)
if "error" in resp:
err_type = resp.get("error", {}).get("type", "")
if err_type == "auth-failed":
return web.json_response(
{"error": "auth failure"}, status=401,
)
return web.json_response(
{"error": resp.get("error", {}).get("message", "error")},
status=400,
)
return web.json_response(resp)
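
# --- Editor's sketch, not from this changeset: exercising the auth surface
# over HTTP. The gateway address and credentials are assumptions; the
# paths come from add_routes() above and the "jwt" field from the login
# docstring.
import asyncio
import aiohttp

async def login_and_change_password():
    base = "http://localhost:8088"          # assumed gateway address
    async with aiohttp.ClientSession() as session:
        async with session.post(f"{base}/api/v1/auth/login", json={
            "username": "admin", "password": "old-secret",
        }) as resp:
            token = (await resp.json())["jwt"]
        async with session.post(
            f"{base}/api/v1/auth/change-password",
            headers={"Authorization": f"Bearer {token}"},
            json={"current_password": "old-secret",
                  "new_password": "new-secret"},
        ) as resp:
            print(resp.status, await resp.json())

# asyncio.run(login_and_change_password())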

View file

@ -1,28 +1,27 @@
import asyncio
from aiohttp import web
import uuid
import logging
from aiohttp import web
from .. capabilities import enforce, enforce_workspace
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class ConstantEndpoint:
def __init__(self, endpoint_path, auth, dispatcher):
def __init__(self, endpoint_path, auth, dispatcher, capability):
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
app.add_routes([
web.post(self.path, self.handle),
])
@ -31,22 +30,14 @@ class ConstantEndpoint:
logger.debug(f"Processing request: {request.path}")
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except:
token = ""
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
identity = await enforce(request, self.auth, self.capability)
try:
data = await request.json()
if identity is not None:
await enforce_workspace(data, identity, self.auth)
async def responder(x, fin):
pass
@ -54,10 +45,8 @@ class ConstantEndpoint:
return web.json_response(resp)
except web.HTTPException:
raise
except Exception as e:
logging.error(f"Exception: {e}")
return web.json_response(
{ "error": str(e) }
)
logger.error(f"Exception: {e}", exc_info=True)
return web.json_response({"error": str(e)})

View file

@ -4,16 +4,18 @@ from aiohttp import web
from trustgraph.i18n import get_language_pack
from .. capabilities import enforce
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class I18nPackEndpoint:
def __init__(self, endpoint_path: str, auth):
def __init__(self, endpoint_path: str, auth, capability):
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
async def start(self):
pass
@ -26,26 +28,13 @@ class I18nPackEndpoint:
async def handle(self, request):
logger.debug(f"Processing i18n pack request: {request.path}")
token = ""
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except Exception:
token = ""
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
await enforce(request, self.auth, self.capability)
lang = request.match_info.get("lang") or "en"
# This is a path traversal defense, and is a critical sec defense.
# Do not remove!
# Path-traversal defense — critical, do not remove.
if "/" in lang or ".." in lang:
return web.HTTPBadRequest(reason="Invalid language code")
pack = get_language_pack(lang)
return web.json_response(pack)

View file

@ -0,0 +1,114 @@
"""
Registry-driven /api/v1/iam endpoint.
The gateway no longer gates IAM management with a single coarse
``users:admin`` capability. Instead, each operation declares its
own capability + resource shape in the registry (``registry.py``);
this endpoint reads the body's ``operation`` field, looks up the
declaration, and asks the IAM regime to authorise the call.
Operations not in the registry produce a 400 ``unknown operation``.
This is the gateway's primary mechanism for fail-closed gating of
the IAM surface; the registry is the source of truth.
"""
import logging
from aiohttp import web
from .. capabilities import (
PUBLIC, AUTHENTICATED, auth_failure,
)
from .. registry import lookup, RequestContext
logger = logging.getLogger("iam-endpoint")
logger.setLevel(logging.INFO)
class IamEndpoint:
"""POST /api/v1/iam — generic forwarder gated by the operation
registry. The IAM dispatcher (``iam_dispatcher``) forwards the
body verbatim to iam-svc once authorisation succeeds."""
def __init__(self, endpoint_path, auth, dispatcher):
self.path = endpoint_path
self.auth = auth
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
app.add_routes([web.post(self.path, self.handle)])
async def handle(self, request):
try:
body = await request.json()
except Exception:
return web.json_response(
{"error": "invalid json"}, status=400,
)
if not isinstance(body, dict):
return web.json_response(
{"error": "body must be an object"}, status=400,
)
op_name = body.get("operation", "")
op = lookup(op_name)
if op is None:
return web.json_response(
{"error": "unknown operation"}, status=400,
)
# Authentication: required for everything except PUBLIC.
identity = None
if op.capability != PUBLIC:
try:
identity = await self.auth.authenticate(request)
except web.HTTPException:
raise
# Authorisation: capability sentinels short-circuit the
# regime call; capability strings go through authorise().
if op.capability not in (PUBLIC, AUTHENTICATED):
ctx = RequestContext(
body=body,
match_info=dict(request.match_info),
identity=identity,
)
try:
resource = op.extract_resource(ctx)
parameters = op.extract_parameters(ctx)
except Exception as e:
logger.warning(
f"extractor failed for {op_name!r}: "
f"{type(e).__name__}: {e}"
)
return web.json_response(
{"error": "bad request"}, status=400,
)
await self.auth.authorise(
identity, op.capability, resource, parameters,
)
# Plumb the authenticated caller's handle through as ``actor``
# so iam-svc handlers (e.g. whoami, future actor-scoped
# checks) know who is making the request. The gateway is
# the only authority for this — body-supplied ``actor``
# values are overwritten so callers can't impersonate.
if identity is not None:
body["actor"] = identity.handle
async def responder(x, fin):
pass
try:
resp = await self.dispatcher.process(body, responder)
except web.HTTPException:
raise
except Exception as e:
logger.error(f"Exception: {e}", exc_info=True)
return web.json_response({"error": str(e)})
return web.json_response(resp)
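
# --- Editor's sketch, not from this changeset: calling the registry-gated
# /api/v1/iam endpoint. The body's "operation" selects the registry entry
# (list-users requires users:read); the address and token are assumptions.
# Any caller-supplied "actor" field is overwritten server-side.
import aiohttp

async def list_users(token, workspace="default"):
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "http://localhost:8088/api/v1/iam",   # assumed gateway address
            headers={"Authorization": f"Bearer {token}"},
            json={"operation": "list-users", "workspace": workspace},
        ) as resp:
            return await resp.json()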

View file

@ -8,72 +8,269 @@ from . variable_endpoint import VariableEndpoint
from . socket import SocketEndpoint
from . metrics import MetricsEndpoint
from . i18n import I18nPackEndpoint
from . auth_endpoints import AuthEndpoints
from . iam_endpoint import IamEndpoint
from . registry_endpoint import RegistryRoutedVariableEndpoint
from .. capabilities import PUBLIC, AUTHENTICATED, auth_failure
from .. registry import lookup as _registry_lookup, RequestContext
from .. dispatch.manager import DispatcherManager
# /api/v1/{kind} (config / flow / librarian / knowledge /
# collection-management), /api/v1/iam, and /api/v1/flow/{flow}/...
# routes are all gated per-operation by the registry, not by a
# per-kind capability map. Login / bootstrap / change-password are
# served by AuthEndpoints with their own PUBLIC / AUTHENTICATED
# sentinels.
import logging as _mgr_logging
_mgr_logger = _mgr_logging.getLogger("endpoint")
class _RoutedVariableEndpoint:
"""HTTP endpoint that gates per request via the operation
registry. The URL's ``kind`` parameter combined with a fixed
``registry_prefix`` yields the registry key; e.g. prefix
``flow-service`` and kind ``agent`` looks up
``flow-service:agent``.
Used for ``/api/v1/flow/{flow}/service/{kind}`` (per-flow
data-plane services). ``/api/v1/{kind}`` (workspace-level
global services) goes through ``RegistryRoutedVariableEndpoint``
which discriminates on body operation as well as URL kind."""
def __init__(self, endpoint_path, auth, dispatcher, registry_prefix):
self.path = endpoint_path
self.auth = auth
self.dispatcher = dispatcher
self._registry_prefix = registry_prefix
async def start(self):
pass
def add_routes(self, app):
app.add_routes([web.post(self.path, self.handle)])
async def handle(self, request):
kind = request.match_info.get("kind", "")
op = _registry_lookup(f"{self._registry_prefix}:{kind}")
if op is None:
return web.json_response(
{"error": "unknown kind"}, status=404,
)
identity = await self.auth.authenticate(request)
try:
data = await request.json()
ctx = RequestContext(
body=data if isinstance(data, dict) else {},
match_info=dict(request.match_info),
identity=identity,
)
resource = op.extract_resource(ctx)
parameters = op.extract_parameters(ctx)
await self.auth.authorise(
identity, op.capability, resource, parameters,
)
async def responder(x, fin):
pass
resp = await self.dispatcher.process(
data, responder, request.match_info,
)
return web.json_response(resp)
except web.HTTPException:
raise
except Exception as e:
_mgr_logger.error(f"Exception: {e}", exc_info=True)
return web.json_response({"error": str(e)})
class _RoutedSocketEndpoint:
"""WebSocket endpoint gated per request via the operation
registry. Like ``_RoutedVariableEndpoint`` but for the
streaming flow import / export socket paths."""
def __init__(self, endpoint_path, auth, dispatcher, registry_prefix):
self.path = endpoint_path
self.auth = auth
self.dispatcher = dispatcher
self._registry_prefix = registry_prefix
async def start(self):
pass
def add_routes(self, app):
app.add_routes([web.get(self.path, self.handle)])
async def handle(self, request):
kind = request.match_info.get("kind", "")
op = _registry_lookup(f"{self._registry_prefix}:{kind}")
if op is None:
return web.json_response(
{"error": "unknown kind"}, status=404,
)
token = request.query.get("token", "")
if not token:
return auth_failure()
from . socket import _QueryTokenRequest
try:
identity = await self.auth.authenticate(
_QueryTokenRequest(token)
)
except web.HTTPException as e:
return e
ctx = RequestContext(
body={},
match_info=dict(request.match_info),
identity=identity,
)
try:
resource = op.extract_resource(ctx)
parameters = op.extract_parameters(ctx)
await self.auth.authorise(
identity, op.capability, resource, parameters,
)
except web.HTTPException as e:
return e
# Delegate the websocket handling to a standalone SocketEndpoint
# with the resolved capability, bypassing the per-request mutation
# concern by instantiating fresh state.
ws_ep = SocketEndpoint(
endpoint_path=self.path,
auth=self.auth,
dispatcher=self.dispatcher,
capability=op.capability,
)
return await ws_ep.handle(request)
class EndpointManager:
def __init__(
self, dispatcher_manager, auth, prometheus_url, timeout=600
self, dispatcher_manager, auth, prometheus_url, timeout=600,
):
self.dispatcher_manager = dispatcher_manager
self.timeout = timeout
self.services = {
}
self.endpoints = [
# Auth surface — public / authenticated-any. Must come
# before the generic /api/v1/{kind} routes to win the
match for /api/v1/auth/* paths. aiohttp matches routes in
registration order, so these are registered first.
AuthEndpoints(
iam_dispatcher=dispatcher_manager.dispatch_auth_iam(),
auth=auth,
),
# /api/v1/iam — registry-driven IAM management. Per
# operation gating happens inside IamEndpoint via the
# operation registry; the dispatcher forwards verbatim
# to iam-svc once authorisation has succeeded. Listed
# before the generic /api/v1/{kind} route so it wins
# the match for "iam".
IamEndpoint(
endpoint_path="/api/v1/iam",
auth=auth,
dispatcher=dispatcher_manager.dispatch_auth_iam(),
),
I18nPackEndpoint(
endpoint_path = "/api/v1/i18n/packs/{lang}",
auth = auth,
endpoint_path="/api/v1/i18n/packs/{lang}",
auth=auth,
capability=PUBLIC,
),
MetricsEndpoint(
endpoint_path = "/api/metrics",
prometheus_url = prometheus_url,
auth = auth,
endpoint_path="/api/metrics",
prometheus_url=prometheus_url,
auth=auth,
capability="metrics:read",
),
VariableEndpoint(
endpoint_path = "/api/v1/{kind}", auth = auth,
dispatcher = dispatcher_manager.dispatch_global_service(),
# Global services: registry-driven per-operation gating.
# Each kind+op combination has a registry entry that
# declares its capability and resource shape. Listed
# after the IAM and auth-surface routes; aiohttp's
# path matcher prefers the more-specific path so this
# variable route doesn't shadow them.
RegistryRoutedVariableEndpoint(
endpoint_path="/api/v1/{kind}",
auth=auth,
dispatcher=dispatcher_manager.dispatch_global_service(),
),
# /api/v1/socket: WebSocket handshake accepts
# unconditionally; the Mux dispatcher runs the
# first-frame auth protocol. Handshake-time 401s break
# browser reconnection, so authentication is always
# in-band for this endpoint.
SocketEndpoint(
endpoint_path = "/api/v1/socket",
auth = auth,
dispatcher = dispatcher_manager.dispatch_socket()
endpoint_path="/api/v1/socket",
auth=auth,
dispatcher=dispatcher_manager.dispatch_socket(),
capability=AUTHENTICATED, # informational only; bypassed
in_band_auth=True,
),
VariableEndpoint(
endpoint_path = "/api/v1/flow/{flow}/service/{kind}",
auth = auth,
dispatcher = dispatcher_manager.dispatch_flow_service(),
# Per-flow request/response services — gated per
# ``flow-service:<kind>`` registry entry.
_RoutedVariableEndpoint(
endpoint_path="/api/v1/flow/{flow}/service/{kind}",
auth=auth,
dispatcher=dispatcher_manager.dispatch_flow_service(),
registry_prefix="flow-service",
),
SocketEndpoint(
endpoint_path = "/api/v1/flow/{flow}/import/{kind}",
auth = auth,
dispatcher = dispatcher_manager.dispatch_flow_import()
# Per-flow streaming import/export — gated per
# ``flow-import:<kind>`` / ``flow-export:<kind>`` registry
# entry.
_RoutedSocketEndpoint(
endpoint_path="/api/v1/flow/{flow}/import/{kind}",
auth=auth,
dispatcher=dispatcher_manager.dispatch_flow_import(),
registry_prefix="flow-import",
),
SocketEndpoint(
endpoint_path = "/api/v1/flow/{flow}/export/{kind}",
auth = auth,
dispatcher = dispatcher_manager.dispatch_flow_export()
_RoutedSocketEndpoint(
endpoint_path="/api/v1/flow/{flow}/export/{kind}",
auth=auth,
dispatcher=dispatcher_manager.dispatch_flow_export(),
registry_prefix="flow-export",
),
StreamEndpoint(
endpoint_path="/api/v1/import-core",
auth=auth,
method="POST",
dispatcher=dispatcher_manager.dispatch_core_import(),
# Cross-subject import — require the admin bundle via a
# single representative capability.
capability="users:admin",
),
StreamEndpoint(
endpoint_path = "/api/v1/import-core",
auth = auth,
method = "POST",
dispatcher = dispatcher_manager.dispatch_core_import(),
endpoint_path="/api/v1/export-core",
auth=auth,
method="GET",
dispatcher=dispatcher_manager.dispatch_core_export(),
capability="users:admin",
),
StreamEndpoint(
endpoint_path = "/api/v1/export-core",
auth = auth,
method = "GET",
dispatcher = dispatcher_manager.dispatch_core_export(),
),
StreamEndpoint(
endpoint_path = "/api/v1/document-stream",
auth = auth,
method = "GET",
dispatcher = dispatcher_manager.dispatch_document_stream(),
endpoint_path="/api/v1/document-stream",
auth=auth,
method="GET",
dispatcher=dispatcher_manager.dispatch_document_stream(),
capability="documents:read",
),
]
@ -84,4 +281,3 @@ class EndpointManager:
async def start(self):
for ep in self.endpoints:
await ep.start()

View file

@ -10,17 +10,19 @@ import asyncio
import uuid
import logging
from .. capabilities import enforce
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class MetricsEndpoint:
def __init__(self, prometheus_url, endpoint_path, auth):
def __init__(self, prometheus_url, endpoint_path, auth, capability):
self.prometheus_url = prometheus_url
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
async def start(self):
pass
@ -35,38 +37,39 @@ class MetricsEndpoint:
logger.debug(f"Processing metrics request: {request.path}")
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except:
token = ""
await enforce(request, self.auth, self.capability)
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
path = request.match_info["path"]
url = (
self.prometheus_url + "/api/v1/" + path + "?" +
request.query_string
)
try:
path = request.match_info["path"]
async with aiohttp.ClientSession() as session:
url = (
self.prometheus_url + "/api/v1/" + path + "?" +
request.query_string
)
async with session.get(url) as resp:
return web.Response(
status=resp.status,
text=await resp.text()
)
except aiohttp.ClientConnectionError as e:
# Upstream unreachable (connect refused, DNS failure,
# server disconnect). Distinguish from our own errors so
# callers know where the fault is.
logger.error(f"Metrics upstream {url} unreachable: {e}")
return web.Response(
status=502,
text=f"Bad Gateway: metrics upstream unreachable: {e}",
)
except Exception as e:
logging.error(f"Exception: {e}")
raise web.HTTPInternalServerError()
logger.error(f"Metrics proxy exception: {e}", exc_info=True)
return web.Response(
status=500,
text=f"Internal Server Error: {e}",
)

View file

@ -0,0 +1,123 @@
"""
Registry-driven dispatch for ``/api/v1/{kind}`` global services.
The body's ``operation`` field plus the URL's ``{kind}`` together
form the canonical operation name (``<kind>:<operation>``) that the
gateway looks up in ``registry.py``. The matched operation
declares its capability and resource shape; this endpoint asks the
IAM regime to authorise the call before forwarding the body
verbatim to the backend dispatcher.
The dispatcher is the same ``dispatch_global_service()`` factory the
old coarse path used; only the gating layer has changed.
Operations not present in the registry are rejected with 400
``unknown operation``; the endpoint fails closed.
"""
import logging
from aiohttp import web
from .. capabilities import (
PUBLIC, AUTHENTICATED, auth_failure,
)
from .. registry import lookup, RequestContext
logger = logging.getLogger("registry-endpoint")
logger.setLevel(logging.INFO)
class RegistryRoutedVariableEndpoint:
"""POST /api/v1/{kind} — kind comes from the URL, operation comes
from the body, both are joined as the registry key."""
def __init__(self, endpoint_path, auth, dispatcher):
self.path = endpoint_path
self.auth = auth
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
app.add_routes([web.post(self.path, self.handle)])
async def handle(self, request):
kind = request.match_info.get("kind", "")
if not kind:
return web.json_response(
{"error": "missing kind"}, status=404,
)
try:
body = await request.json()
except Exception:
return web.json_response(
{"error": "invalid json"}, status=400,
)
if not isinstance(body, dict):
return web.json_response(
{"error": "body must be an object"}, status=400,
)
op_name = body.get("operation", "")
if not op_name:
return web.json_response(
{"error": "missing operation"}, status=400,
)
registry_key = f"{kind}:{op_name}"
op = lookup(registry_key)
if op is None:
return web.json_response(
{"error": "unknown operation"}, status=400,
)
identity = None
if op.capability != PUBLIC:
identity = await self.auth.authenticate(request)
if op.capability not in (PUBLIC, AUTHENTICATED):
ctx = RequestContext(
body=body,
match_info=dict(request.match_info),
identity=identity,
)
try:
resource = op.extract_resource(ctx)
parameters = op.extract_parameters(ctx)
except Exception as e:
logger.warning(
f"extractor failed for {registry_key!r}: "
f"{type(e).__name__}: {e}"
)
return web.json_response(
{"error": "bad request"}, status=400,
)
await self.auth.authorise(
identity, op.capability, resource, parameters,
)
# Default-fill workspace into the body so downstream
# dispatchers see the canonical resolved value. The
# extractor has already pulled the workspace out;
# mirror it back to the body for the verbatim forward.
if "workspace" in resource:
body["workspace"] = resource["workspace"]
async def responder(x, fin):
pass
try:
resp = await self.dispatcher.process(
body, responder, request.match_info,
)
except web.HTTPException:
raise
except Exception as e:
logger.error(f"Exception: {e}", exc_info=True)
return web.json_response({"error": str(e)})
return web.json_response(resp)
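
# --- Editor's sketch, not from this changeset: how the registry key is
# formed for this endpoint. Kind comes from the URL, operation from the
# body; the librarian kind and "add-document" operation named below are
# illustrative, not guaranteed registry entries.
from .. registry import lookup as _lookup

def resolve_capability(kind: str, body: dict):
    op = _lookup(f"{kind}:{body.get('operation', '')}")
    if op is None:
        return None       # maps to 400 "unknown operation": fail closed
    return op.capability  # what the regime is asked to authorise

# resolve_capability("librarian", {"operation": "add-document"})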

View file

@ -4,6 +4,9 @@ from aiohttp import web, WSMsgType
import logging
from .. running import Running
from .. capabilities import (
PUBLIC, AUTHENTICATED, auth_failure,
)
logger = logging.getLogger("socket")
logger.setLevel(logging.INFO)
@ -11,12 +14,25 @@ logger.setLevel(logging.INFO)
class SocketEndpoint:
def __init__(
self, endpoint_path, auth, dispatcher,
self, endpoint_path, auth, dispatcher, capability,
in_band_auth=False,
):
"""
``in_band_auth=True`` skips the handshake-time auth check.
The WebSocket handshake always succeeds; the dispatcher is
expected to gate itself via the first-frame auth protocol
(see ``Mux``).
This avoids the browser problem where a 401 on the handshake
is treated as permanent and prevents reconnection, and lets
long-lived sockets refresh their credential mid-session by
sending a new auth frame.
"""
self.path = endpoint_path
self.auth = auth
self.operation = "socket"
self.capability = capability
self.in_band_auth = in_band_auth
self.dispatcher = dispatcher
@ -61,15 +77,33 @@ class SocketEndpoint:
raise
async def handle(self, request):
"""Enhanced handler with better cleanup"""
try:
token = request.query['token']
except:
token = ""
"""Enhanced handler with better cleanup.
Auth: WebSocket clients pass the bearer token on the
``?token=...`` query string; we wrap it into a synthetic
Authorization header before delegating to the standard auth
path so the IAM-backed flow (JWT / API key) applies uniformly.
The first-frame auth protocol described in the IAM spec is
a future upgrade."""
if not self.in_band_auth and self.capability != PUBLIC:
token = request.query.get("token", "")
if not token:
return auth_failure()
try:
identity = await self.auth.authenticate(
_QueryTokenRequest(token)
)
except web.HTTPException as e:
return e
if self.capability != AUTHENTICATED:
try:
await self.auth.authorise(
identity, self.capability, {}, {},
)
except web.HTTPException as e:
return e
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
# 50MB max message size
ws = web.WebSocketResponse(max_msg_size=52428800)
@ -150,3 +184,11 @@ class SocketEndpoint:
web.get(self.path, self.handle),
])
class _QueryTokenRequest:
"""Minimal shim that exposes headers["Authorization"] to
IamAuth.authenticate(), derived from a query-string token."""
def __init__(self, token):
self.headers = {"Authorization": f"Bearer {token}"}
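
# --- Editor's sketch, not from this changeset: connecting to one of the
# query-token-gated socket paths. The host, flow name, and kind are
# illustrative; the ?token= convention comes from the handler above.
import asyncio
import websockets

async def stream_export(token, flow="default", kind="triples"):
    url = (
        f"ws://localhost:8088/api/v1/flow/{flow}/export/{kind}"
        f"?token={token}"
    )
    async with websockets.connect(url) as ws:
        async for message in ws:
            print(message)

# asyncio.run(stream_export("tg_example_key"))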

View file

@ -1,82 +1,64 @@
import asyncio
from aiohttp import web
import logging
from aiohttp import web
from .. capabilities import enforce
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class StreamEndpoint:
def __init__(self, endpoint_path, auth, dispatcher, method="POST"):
def __init__(
self, endpoint_path, auth, dispatcher, capability, method="POST",
):
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
self.method = method
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
if self.method == "POST":
app.add_routes([
web.post(self.path, self.handle),
])
app.add_routes([web.post(self.path, self.handle)])
elif self.method == "GET":
app.add_routes([
web.get(self.path, self.handle),
])
app.add_routes([web.get(self.path, self.handle)])
else:
raise RuntimeError("Bad method" + self.method)
raise RuntimeError("Bad method " + self.method)
async def handle(self, request):
logger.debug(f"Processing request: {request.path}")
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except:
token = ""
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
await enforce(request, self.auth, self.capability)
try:
data = request.content
async def error(err):
return web.HTTPInternalServerError(text = err)
return web.HTTPInternalServerError(text=err)
async def ok(
status=200, reason="OK", type="application/octet-stream"
status=200, reason="OK",
type="application/octet-stream",
):
response = web.StreamResponse(
status = status, reason = reason,
headers = {"Content-Type": type}
status=status, reason=reason,
headers={"Content-Type": type},
)
await response.prepare(request)
return response
resp = await self.dispatcher.process(
data, error, ok, request
)
resp = await self.dispatcher.process(data, error, ok, request)
return resp
except web.HTTPException:
raise
except Exception as e:
logging.error(f"Exception: {e}")
return web.json_response(
{ "error": str(e) }
)
logger.error(f"Exception: {e}", exc_info=True)
return web.json_response({"error": str(e)})

View file

@ -1,27 +1,27 @@
import asyncio
from aiohttp import web
import logging
from aiohttp import web
from .. capabilities import enforce, enforce_workspace
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class VariableEndpoint:
def __init__(self, endpoint_path, auth, dispatcher):
def __init__(self, endpoint_path, auth, dispatcher, capability):
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
app.add_routes([
web.post(self.path, self.handle),
])
@ -30,35 +30,25 @@ class VariableEndpoint:
logger.debug(f"Processing request: {request.path}")
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except:
token = ""
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
identity = await enforce(request, self.auth, self.capability)
try:
data = await request.json()
if identity is not None:
await enforce_workspace(data, identity, self.auth)
async def responder(x, fin):
pass
resp = await self.dispatcher.process(
data, responder, request.match_info
data, responder, request.match_info,
)
return web.json_response(resp)
except web.HTTPException:
raise
except Exception as e:
logging.error(f"Exception: {e}")
return web.json_response(
{ "error": str(e) }
)
logger.error(f"Exception: {e}", exc_info=True)
return web.json_response({"error": str(e)})

View file

@ -0,0 +1,533 @@
"""
Gateway operation registry.
Single declarative table mapping each operation the gateway
recognises to:
- The capability the IAM regime is asked to authorise against.
- The resource level (system / workspace / flow) determines the
shape of the resource identifier handed to ``authorise``.
- Extractors that build the resource and parameters from the
request context.
This is a gateway-internal concept. It is not part of the IAM
contract; the contract specifies what arguments ``authorise``
receives; the registry is how the gateway populates them.
See docs/tech-specs/iam-contract.md for the contract and
docs/tech-specs/iam.md for the request anatomy.
"""
from dataclasses import dataclass, field
from typing import Any, Callable
# Sentinels for operations that don't go through capability-based
# authorisation. Mirror the values used in capabilities.py so the
# gateway endpoint layer can recognise them uniformly.
PUBLIC = "__public__"
AUTHENTICATED = "__authenticated__"
class ResourceLevel:
"""Where the operation's resource lives.
``SYSTEM``: the operation acts on a deployment-level resource
(the user registry, the workspace registry, the signing key).
resource = {}. Workspace, if relevant, is a parameter, not an
address.
``WORKSPACE``: the operation acts on something within a workspace
(config, library, knowledge, collections, flow lifecycle).
resource = {workspace}.
``FLOW``: the operation acts on something within a flow within a
workspace (graph, agent, llm, etc.). resource = {workspace, flow}.
"""
SYSTEM = "system"
WORKSPACE = "workspace"
FLOW = "flow"
@dataclass
class RequestContext:
"""The bundle of inputs the registry's extractors operate on.
Assembled by the gateway from the incoming request after
authentication."""
# Parsed JSON body (HTTP) or inner request payload (WebSocket).
body: dict = field(default_factory=dict)
# URL path components (HTTP) or WebSocket envelope routing
# fields (id, service, workspace, flow).
match_info: dict = field(default_factory=dict)
# Authenticated identity for default-fill-in. Always present
# by the time extractors run, except for PUBLIC operations
# where it is None.
identity: Any = None
@dataclass
class Operation:
"""Declared operation the gateway can dispatch + authorise."""
# Canonical operation name (used for registry lookup, audit,
# debug logs). Mirrors the operation strings in the IAM
# service and other backends where applicable.
name: str
# Capability required to invoke this operation. Either a
# string from the capability vocabulary in capabilities.md, or
# the PUBLIC / AUTHENTICATED sentinel for operations that
# don't go through capability-based authorisation.
capability: str
# Where the operation's resource lives. Determines the
# shape of the resource argument passed to authorise.
resource_level: str
# Build the resource identifier from the request context.
# Returns a dict with the appropriate components for the
# resource level: {} for SYSTEM, {workspace} for WORKSPACE,
# {workspace, flow} for FLOW. Default-fill-in of workspace
# from identity.workspace happens here when applicable.
extract_resource: Callable[[RequestContext], dict]
# Build the parameters dict — decision-relevant fields the
# operation supplied that are not part of the resource
# address. E.g. workspace association on a system-level
# user-registry operation.
extract_parameters: Callable[[RequestContext], dict]
# ---------------------------------------------------------------------------
# Registry storage.
# ---------------------------------------------------------------------------
_REGISTRY: dict[str, Operation] = {}
def register(op: Operation) -> None:
if op.name in _REGISTRY:
raise RuntimeError(
f"operation {op.name!r} already registered"
)
_REGISTRY[op.name] = op
def lookup(name: str) -> Operation | None:
return _REGISTRY.get(name)
def all_operations() -> list[Operation]:
return list(_REGISTRY.values())
# ---------------------------------------------------------------------------
# Common extractor helpers.
# ---------------------------------------------------------------------------
def _empty_resource(_ctx: RequestContext) -> dict:
"""System-level resource: empty dict."""
return {}
def _workspace_from_body(ctx: RequestContext) -> dict:
"""Workspace-level resource sourced from the request body's
workspace field, defaulting to the caller's bound workspace."""
ws = (ctx.body.get("workspace") if isinstance(ctx.body, dict) else "")
if not ws and ctx.identity is not None:
ws = ctx.identity.workspace
return {"workspace": ws}
def _flow_from_match_info(ctx: RequestContext) -> dict:
"""Flow-level resource sourced from URL path components or WS
envelope fields. Both ``workspace`` and ``flow`` are required;
no default-fill-in (the address is the operation's identity)."""
return {
"workspace": ctx.match_info.get("workspace", ""),
"flow": ctx.match_info.get("flow", ""),
}
def _no_parameters(_ctx: RequestContext) -> dict:
return {}
def _body_as_parameters(ctx: RequestContext) -> dict:
"""All body fields are parameters — used when the operation's
body is small and uniformly decision-relevant (e.g. user-
registry ops where the body's user.workspace is what the
regime checks against the admin's scope)."""
return dict(ctx.body) if isinstance(ctx.body, dict) else {}
def _workspace_param_only(ctx: RequestContext) -> dict:
"""Parameters dict carrying only the workspace association.
Used by system-level operations (e.g. user-registry ops) where
the workspace isn't part of the resource address but is the
field the regime uses to scope the admin's authority.
Pulls the workspace from the inner ``user`` / ``workspace_record``
body field if present (create-user, create-workspace), then from
the top-level body, then from the caller's bound workspace."""
body = ctx.body if isinstance(ctx.body, dict) else {}
inner_user = body.get("user") if isinstance(body.get("user"), dict) else {}
inner_ws = (
body.get("workspace_record")
if isinstance(body.get("workspace_record"), dict) else {}
)
ws = (
inner_user.get("workspace")
or inner_ws.get("id")
or body.get("workspace")
)
if not ws and ctx.identity is not None:
ws = ctx.identity.workspace
return {"workspace": ws or ""}
# ---------------------------------------------------------------------------
# Operation registrations.
#
# The gateway looks operations up by their canonical name (the same
# string the request body / WS envelope carries in its ``operation``
# field where applicable). Auth-surface operations (login, bootstrap,
# change-password) are not listed here — they have their own routes
# in auth_endpoints.py and use PUBLIC / AUTHENTICATED sentinels
# directly. Pure gateway↔IAM internal operations (resolve-api-key,
# authorise, authorise-many, get-signing-key-public) are likewise
# excluded; they are never invoked over the public API.
# ---------------------------------------------------------------------------
# IAM management operations. All routed through /api/v1/iam, body
# carries ``operation`` plus operation-specific fields.
# User registry: SYSTEM-level resource (users are global, identified
# by handle). The admin's authority is scoped per workspace via the
# parameters {workspace} field — that's what the regime checks
# against the admin's role workspace_scope.
register(Operation(
name="create-user",
capability="users:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_workspace_param_only,
))
register(Operation(
name="list-users",
capability="users:read",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_workspace_param_only,
))
register(Operation(
name="get-user",
capability="users:read",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_workspace_param_only,
))
register(Operation(
name="update-user",
capability="users:write",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_workspace_param_only,
))
register(Operation(
name="disable-user",
capability="users:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_workspace_param_only,
))
register(Operation(
name="enable-user",
capability="users:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_workspace_param_only,
))
register(Operation(
name="delete-user",
capability="users:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_workspace_param_only,
))
register(Operation(
name="reset-password",
capability="users:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_workspace_param_only,
))
# API keys: SYSTEM-level resource — like users, a key record exists
# in the deployment-wide keys registry. The workspace the key
# authenticates to is a property of the record, not a containment relationship;
# it appears as a parameter so the regime can scope the admin's
# authority to issue / list / revoke against it.
register(Operation(
name="create-api-key",
capability="keys:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_workspace_param_only,
))
register(Operation(
name="list-api-keys",
capability="keys:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_workspace_param_only,
))
register(Operation(
name="revoke-api-key",
capability="keys:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_workspace_param_only,
))
# Workspace registry: SYSTEM-level resource (workspaces are the
# top-level addressable unit). No parameters — the workspace being
# acted on is identified by the body, not used as a scope cue.
register(Operation(
name="create-workspace",
capability="workspaces:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_no_parameters,
))
register(Operation(
name="list-workspaces",
capability="workspaces:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_no_parameters,
))
register(Operation(
name="get-workspace",
capability="workspaces:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_no_parameters,
))
register(Operation(
name="update-workspace",
capability="workspaces:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_no_parameters,
))
register(Operation(
name="disable-workspace",
capability="workspaces:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_no_parameters,
))
# Signing key: SYSTEM-level operational op.
register(Operation(
name="rotate-signing-key",
capability="iam:admin",
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_no_parameters,
))
# ---------------------------------------------------------------------------
# Auth-surface entries.
#
# Listed here so the registry is the one place the gateway looks for
# operation→capability mappings — including the sentinels for paths
# that don't go through capability-based authorisation. The actual
# routing is in auth_endpoints.py; these entries let the registry-
# driven dispatcher recognise the operation if it sees it on a
# generic path.
# ---------------------------------------------------------------------------
register(Operation(
name="login",
capability=PUBLIC,
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_no_parameters,
))
register(Operation(
name="bootstrap",
capability=PUBLIC,
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_no_parameters,
))
register(Operation(
name="bootstrap-status",
capability=PUBLIC,
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_no_parameters,
))
register(Operation(
name="change-password",
capability=AUTHENTICATED,
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_no_parameters,
))
register(Operation(
name="whoami",
capability=AUTHENTICATED,
resource_level=ResourceLevel.SYSTEM,
extract_resource=_empty_resource,
extract_parameters=_no_parameters,
))
# ---------------------------------------------------------------------------
# Generic kind/operation entries.
#
# Names are ``<kind>:<operation>`` so the registry key is unique
# across dispatchers. All entries below are workspace-level
# resources (workspace defaulted from the caller's bound workspace
# if absent). Read/write distinction maps to the existing
# ``<subject>:read`` / ``<subject>:write`` capability vocabulary
# defined in capabilities.md.
# ---------------------------------------------------------------------------
def _register_kind_op(kind: str, op: str, capability: str) -> None:
"""Helper: register a workspace-level kind:op with the standard
extractors (workspace from body, no extra parameters)."""
register(Operation(
name=f"{kind}:{op}",
capability=capability,
resource_level=ResourceLevel.WORKSPACE,
extract_resource=_workspace_from_body,
extract_parameters=_no_parameters,
))
# config: KV-style workspace config service.
for _op in ("get", "list", "getvalues", "getvalues-all-ws", "config"):
_register_kind_op("config", _op, "config:read")
for _op in ("put", "delete"):
_register_kind_op("config", _op, "config:write")
# flow: flow-blueprint and flow-lifecycle service.
for _op in ("list-blueprints", "get-blueprint", "list-flows", "get-flow"):
_register_kind_op("flow", _op, "flows:read")
for _op in ("put-blueprint", "delete-blueprint", "start-flow", "stop-flow"):
_register_kind_op("flow", _op, "flows:write")
# librarian: document storage and processing service.
for _op in (
"get-document-metadata", "get-document-content",
"stream-document", "list-documents", "list-processing",
"get-upload-status", "list-uploads",
):
_register_kind_op("librarian", _op, "documents:read")
for _op in (
"add-document", "remove-document", "update-document",
"add-processing", "remove-processing",
"begin-upload", "upload-chunk", "complete-upload", "abort-upload",
):
_register_kind_op("librarian", _op, "documents:write")
# knowledge: knowledge-graph core service.
for _op in ("get-kg-core", "list-kg-cores"):
_register_kind_op("knowledge", _op, "knowledge:read")
for _op in ("put-kg-core", "delete-kg-core",
"load-kg-core", "unload-kg-core"):
_register_kind_op("knowledge", _op, "knowledge:write")
# collection-management: workspace collection lifecycle.
_register_kind_op("collection-management", "list-collections", "collections:read")
for _op in ("update-collection", "delete-collection"):
_register_kind_op("collection-management", _op, "collections:write")
# ---------------------------------------------------------------------------
# Per-flow data-plane services.
#
# /api/v1/flow/{flow}/service/{kind} and the streaming
# /api/v1/flow/{flow}/{import,export}/{kind} paths. No body-level
# ``operation`` discriminator — the URL kind is the operation
# identity. Resource is FLOW level (workspace + flow).
#
# Names: ``flow-service:<kind>``, ``flow-import:<kind>``,
# ``flow-export:<kind>``.
# ---------------------------------------------------------------------------
def _register_flow_kind(prefix: str, kind: str, capability: str) -> None:
register(Operation(
name=f"{prefix}:{kind}",
capability=capability,
resource_level=ResourceLevel.FLOW,
extract_resource=_flow_from_match_info,
extract_parameters=_no_parameters,
))
# Request/response services on /api/v1/flow/{flow}/service/{kind}.
_FLOW_SERVICES = {
"agent": "agent",
"text-completion": "llm",
"prompt": "llm",
"mcp-tool": "mcp",
"graph-rag": "graph:read",
"document-rag": "documents:read",
"embeddings": "embeddings",
"graph-embeddings": "graph:read",
"document-embeddings": "documents:read",
"triples": "graph:read",
"rows": "rows:read",
"nlp-query": "rows:read",
"structured-query": "rows:read",
"structured-diag": "rows:read",
"row-embeddings": "rows:read",
"sparql": "graph:read",
}
for _kind, _cap in _FLOW_SERVICES.items():
_register_flow_kind("flow-service", _kind, _cap)
# Streaming import socket endpoints.
_FLOW_IMPORTS = {
"triples": "graph:write",
"graph-embeddings": "graph:write",
"document-embeddings": "documents:write",
"entity-contexts": "documents:write",
"rows": "rows:write",
}
for _kind, _cap in _FLOW_IMPORTS.items():
_register_flow_kind("flow-import", _kind, _cap)
# Streaming export socket endpoints.
_FLOW_EXPORTS = {
"triples": "graph:read",
"graph-embeddings": "graph:read",
"document-embeddings": "documents:read",
"entity-contexts": "documents:read",
}
for _kind, _cap in _FLOW_EXPORTS.items():
_register_flow_kind("flow-export", _kind, _cap)

View file

@@ -12,7 +12,7 @@ import os
from trustgraph.base.logging import setup_logging, add_logging_args
from trustgraph.base.pubsub import get_pubsub, add_pubsub_args
from . auth import Authenticator
from . auth import IamAuth
from . config.receiver import ConfigReceiver
from . dispatch.manager import DispatcherManager
@@ -35,7 +35,6 @@ default_prometheus_url = os.getenv("PROMETHEUS_URL", "http://prometheus:9090")
default_pulsar_api_key = os.getenv("PULSAR_API_KEY", None)
default_timeout = 600
default_port = 8088
default_api_token = os.getenv("GATEWAY_SECRET", "")
class Api:
@@ -60,13 +59,14 @@ class Api:
if not self.prometheus_url.endswith("/"):
self.prometheus_url += "/"
api_token = config.get("api_token", default_api_token)
# Token not set, or token equal to the empty string, means no auth
if api_token:
self.auth = Authenticator(token=api_token)
else:
self.auth = Authenticator(allow_all=True)
# IAM-backed authentication. The legacy GATEWAY_SECRET
# shared-token path has been removed — there is no
# "open for everyone" fallback. The gateway cannot
# authenticate any request until IAM is reachable.
self.auth = IamAuth(
backend=self.pubsub_backend,
id=config.get("id", "api-gateway"),
)
self.config_receiver = ConfigReceiver(self.pubsub_backend)
@@ -118,6 +118,7 @@ class Api:
config_receiver = self.config_receiver,
prefix = "gateway",
queue_overrides = queue_overrides,
auth = self.auth,
)
self.endpoint_manager = EndpointManager(
@@ -132,12 +133,18 @@ class Api:
]
async def app_factory(self):
self.app = web.Application(
middlewares=[],
client_max_size=256 * 1024 * 1024
)
# Fetch IAM signing public key before accepting traffic.
# Blocks for a bounded retry window; the gateway starts even
# if IAM is still unreachable (JWT validation will 401 until
# the key is available).
await self.auth.start()
await self.config_receiver.start()
for ep in self.endpoints:
@@ -189,12 +196,6 @@ def run():
help=f'API request timeout in seconds (default: {default_timeout})',
)
parser.add_argument(
'--api-token',
default=default_api_token,
help=f'Secret API token (default: no auth)',
)
add_logging_args(parser)
parser.add_argument(

View file

@@ -0,0 +1 @@
from . service import *

View file

@@ -0,0 +1,4 @@
from . service import run
run()

File diff suppressed because it is too large

View file

@@ -0,0 +1,233 @@
"""
IAM service processor. Consumes the IAM request queue, forwards each
request to the IamService business logic, and returns the response on
the IAM response queue.
Shape mirrors trustgraph.config.service.
"""
import logging
import os
from trustgraph.schema import Error
from trustgraph.schema import IamRequest, IamResponse
from trustgraph.schema import iam_request_queue, iam_response_queue
from trustgraph.base import AsyncProcessor, Consumer, Producer
from trustgraph.base import ConsumerMetrics, ProducerMetrics
from trustgraph.base.cassandra_config import (
add_cassandra_args, resolve_cassandra_config,
)
from . iam import IamService
logger = logging.getLogger(__name__)
default_ident = "iam-svc"
default_iam_request_queue = iam_request_queue
default_iam_response_queue = iam_response_queue
# Environment variables consulted as a fallback when the
# corresponding params field is not set in the processor-group YAML
# or via CLI. Intended for K8s Secret / env-var injection so the
# bootstrap token never has to live in the YAML (and thus in git).
ENV_BOOTSTRAP_MODE = "IAM_BOOTSTRAP_MODE"
ENV_BOOTSTRAP_TOKEN = "IAM_BOOTSTRAP_TOKEN"
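# Illustrative sketch (assumption, not used by the class below): the
# resolution order the constructor applies -- an explicit params field
# (CLI / processor-group YAML) wins, then the environment variable,
# then None, which the fail-closed checks reject.
def _example_resolve_setting(params: dict, key: str, env_name: str):
    return params.get(key) or os.environ.get(env_name)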
class Processor(AsyncProcessor):
def __init__(self, **params):
iam_req_q = params.get(
"iam_request_queue", default_iam_request_queue,
)
iam_resp_q = params.get(
"iam_response_queue", default_iam_response_queue,
)
# Resolve bootstrap mode + token. Precedence: explicit
# params (CLI / processor-group YAML) → environment variable
# → unset (fail-closed). The env-var path is the K8s-native
# injection point: an `IAM_BOOTSTRAP_TOKEN` from a Secret
# never has to land in the YAML, and therefore never enters
# git history.
bootstrap_mode = (
params.get("bootstrap_mode")
or os.environ.get(ENV_BOOTSTRAP_MODE)
)
bootstrap_token = (
params.get("bootstrap_token")
or os.environ.get(ENV_BOOTSTRAP_TOKEN)
)
if bootstrap_mode not in ("token", "bootstrap"):
raise RuntimeError(
"iam-svc: bootstrap-mode is required. Set to 'token' "
"(with bootstrap-token) for production, or 'bootstrap' "
"to enable the explicit bootstrap operation over the "
"pub/sub bus (dev / quick-start only, not safe under "
"public exposure). Configurable via processor-group "
f"params or the {ENV_BOOTSTRAP_MODE} environment "
"variable. Refusing to start."
)
if bootstrap_mode == "token" and not bootstrap_token:
raise RuntimeError(
"iam-svc: bootstrap-mode=token requires bootstrap-token "
f"(or the {ENV_BOOTSTRAP_TOKEN} environment "
"variable). Refusing to start."
)
if bootstrap_mode == "bootstrap" and bootstrap_token:
raise RuntimeError(
"iam-svc: bootstrap-token is not accepted when "
"bootstrap-mode=bootstrap. Ambiguous intent. "
"Refusing to start."
)
self.bootstrap_mode = bootstrap_mode
self.bootstrap_token = bootstrap_token
cassandra_host = params.get("cassandra_host")
cassandra_username = params.get("cassandra_username")
cassandra_password = params.get("cassandra_password")
hosts, username, password, keyspace = resolve_cassandra_config(
host=cassandra_host,
username=cassandra_username,
password=cassandra_password,
default_keyspace="iam",
)
self.cassandra_host = hosts
self.cassandra_username = username
self.cassandra_password = password
super().__init__(
**params | {
"iam_request_schema": IamRequest.__name__,
"iam_response_schema": IamResponse.__name__,
"cassandra_host": self.cassandra_host,
"cassandra_username": self.cassandra_username,
"cassandra_password": self.cassandra_password,
}
)
iam_request_metrics = ConsumerMetrics(
processor=self.id, flow=None, name="iam-request",
)
iam_response_metrics = ProducerMetrics(
processor=self.id, flow=None, name="iam-response",
)
self.iam_request_topic = iam_req_q
self.iam_request_consumer = Consumer(
taskgroup=self.taskgroup,
backend=self.pubsub,
flow=None,
topic=iam_req_q,
subscriber=self.id,
schema=IamRequest,
handler=self.on_iam_request,
metrics=iam_request_metrics,
)
self.iam_response_producer = Producer(
backend=self.pubsub,
topic=iam_resp_q,
schema=IamResponse,
metrics=iam_response_metrics,
)
self.iam = IamService(
host=self.cassandra_host,
username=self.cassandra_username,
password=self.cassandra_password,
keyspace=keyspace,
bootstrap_mode=self.bootstrap_mode,
bootstrap_token=self.bootstrap_token,
)
logger.info(
f"IAM service initialised (bootstrap-mode={self.bootstrap_mode})"
)
async def start(self):
await self.pubsub.ensure_topic(self.iam_request_topic)
# Token-mode auto-bootstrap runs before we accept requests so
# the first inbound call always sees a populated table.
await self.iam.auto_bootstrap_if_token_mode()
await self.iam_request_consumer.start()
async def on_iam_request(self, msg, consumer, flow):
id = None
try:
v = msg.value()
id = msg.properties()["id"]
logger.debug(
f"Handling IAM request {id} op={v.operation!r}"
)
resp = await self.iam.handle(v)
await self.iam_response_producer.send(
resp, properties={"id": id},
)
except Exception as e:
logger.error(
f"IAM request failed: {type(e).__name__}: {e}",
exc_info=True,
)
resp = IamResponse(
error=Error(type="internal-error", message=str(e)),
)
if id is not None:
await self.iam_response_producer.send(
resp, properties={"id": id},
)
@staticmethod
def add_args(parser):
AsyncProcessor.add_args(parser)
parser.add_argument(
"--iam-request-queue",
default=default_iam_request_queue,
help=f"IAM request queue (default: {default_iam_request_queue})",
)
parser.add_argument(
"--iam-response-queue",
default=default_iam_response_queue,
help=f"IAM response queue (default: {default_iam_response_queue})",
)
parser.add_argument(
"--bootstrap-mode",
default=None,
choices=["token", "bootstrap"],
help=(
"IAM bootstrap mode (required). "
"'token' = operator supplies the initial admin API "
"key via --bootstrap-token; auto-seeds on first start, "
"bootstrap operation refused. "
"'bootstrap' = bootstrap operation is live over the "
"bus until tables are populated; a token is generated "
"and returned by tg-bootstrap-iam. Unsafe to run "
"'bootstrap' mode with public exposure."
),
)
parser.add_argument(
"--bootstrap-token",
default=None,
help=(
"Initial admin API key plaintext, required when "
"--bootstrap-mode=token. Treat as a one-time "
"credential: the operator should rotate to a new key "
"and revoke this one after first use."
),
)
add_cassandra_args(parser)
def run():
Processor.launch(default_ident, __doc__)

View file

@@ -4,7 +4,7 @@ Simple LLM service, performs text prompt completion using an Ollama service.
Input is prompt, output is response.
"""
from ollama import Client
from ollama import AsyncClient
import os
import logging
@@ -38,23 +38,23 @@ class Processor(LlmService):
self.default_model = model
self.temperature = temperature
self.llm = Client(host=ollama)
self.llm = AsyncClient(host=ollama)
self._checked_models = set()
def _ensure_model(self, model_name):
async def _ensure_model(self, model_name):
"""Check if model exists locally, pull it if not."""
if model_name in self._checked_models:
return
try:
self.llm.show(model_name)
await self.llm.show(model_name)
self._checked_models.add(model_name)
except Exception as e:
status_code = getattr(e, 'status_code', None)
if status_code == 404 or "not found" in str(e).lower():
logger.info(f"Ollama model '{model_name}' not found locally. Pulling, this may take a while...")
try:
self.llm.pull(model_name)
await self.llm.pull(model_name)
self._checked_models.add(model_name)
logger.info(f"Successfully pulled Ollama model '{model_name}'.")
except Exception as pull_e:
@@ -66,9 +66,9 @@ class Processor(LlmService):
# Use provided model or fall back to default
model_name = model or self.default_model
# Ensure the model exists/is pulled
self._ensure_model(model_name)
await self._ensure_model(model_name)
# Use provided temperature or fall back to default
effective_temperature = temperature if temperature is not None else self.temperature
@@ -79,7 +79,7 @@ class Processor(LlmService):
try:
response = self.llm.generate(model_name, prompt, options={'temperature': effective_temperature})
response = await self.llm.generate(model_name, prompt, options={'temperature': effective_temperature})
response_text = response['response']
logger.debug("Sending response...")
@@ -113,7 +113,7 @@ class Processor(LlmService):
model_name = model or self.default_model
# Ensure the model exists/is pulled
self._ensure_model(model_name)
await self._ensure_model(model_name)
effective_temperature = temperature if temperature is not None else self.temperature
@@ -123,7 +123,7 @@ class Processor(LlmService):
prompt = system + "\n\n" + prompt
try:
stream = self.llm.generate(
stream = await self.llm.generate(
model_name,
prompt,
options={'temperature': effective_temperature},
@@ -133,7 +133,7 @@ class Processor(LlmService):
total_input_tokens = 0
total_output_tokens = 0
for chunk in stream:
async for chunk in stream:
if 'response' in chunk and chunk['response']:
yield LlmChunk(
text=chunk['response'],
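# Illustrative sketch (assumption, not part of the change above): minimal
# standalone use of the async Ollama client this diff switches to. The
# host, model name, and prompts are placeholders; the streaming call
# mirrors the generate()/async-for pattern shown in the diff.
import asyncio
from ollama import AsyncClient

async def demo():
    llm = AsyncClient(host="http://localhost:11434")
    # Non-streaming completion: a single response object.
    response = await llm.generate("gemma2", "Say hello in one word.",
                                  options={"temperature": 0.0})
    print(response["response"])
    # Streaming completion: an async iterator of chunks.
    stream = await llm.generate("gemma2", "Count to five.",
                                options={"temperature": 0.0}, stream=True)
    async for chunk in stream:
        if "response" in chunk and chunk["response"]:
            print(chunk["response"], end="", flush=True)

asyncio.run(demo())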

View file

@@ -0,0 +1,436 @@
"""
IAM Cassandra table store.
Tables:
- iam_workspaces (id primary key)
- iam_users (id primary key) + iam_users_by_username lookup table
(workspace, username) -> id
- iam_api_keys (key_hash primary key) with secondary index on user_id
- iam_signing_keys (kid primary key) RSA keypairs for JWT signing
See docs/tech-specs/iam-protocol.md for the wire-level context.
"""
import logging
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from ssl import SSLContext, PROTOCOL_TLSv1_2
from . cassandra_async import async_execute
logger = logging.getLogger(__name__)
class IamTableStore:
def __init__(
self,
cassandra_host, cassandra_username, cassandra_password,
keyspace,
):
self.keyspace = keyspace
logger.info("IAM: connecting to Cassandra...")
if isinstance(cassandra_host, str):
cassandra_host = [h.strip() for h in cassandra_host.split(",")]
if cassandra_username and cassandra_password:
ssl_context = SSLContext(PROTOCOL_TLSv1_2)
auth_provider = PlainTextAuthProvider(
username=cassandra_username, password=cassandra_password,
)
self.cluster = Cluster(
cassandra_host,
auth_provider=auth_provider,
ssl_context=ssl_context,
)
else:
self.cluster = Cluster(cassandra_host)
self.cassandra = self.cluster.connect()
logger.info("IAM: connected.")
self._ensure_schema()
self._prepare_statements()
def _ensure_schema(self):
# FIXME: Replication factor should be configurable.
self.cassandra.execute(f"""
create keyspace if not exists {self.keyspace}
with replication = {{
'class' : 'SimpleStrategy',
'replication_factor' : 1
}};
""")
self.cassandra.set_keyspace(self.keyspace)
self.cassandra.execute("""
CREATE TABLE IF NOT EXISTS iam_workspaces (
id text PRIMARY KEY,
name text,
enabled boolean,
created timestamp
);
""")
self.cassandra.execute("""
CREATE TABLE IF NOT EXISTS iam_users (
id text PRIMARY KEY,
workspace text,
username text,
name text,
email text,
password_hash text,
roles set<text>,
enabled boolean,
must_change_password boolean,
created timestamp
);
""")
self.cassandra.execute("""
CREATE TABLE IF NOT EXISTS iam_users_by_username (
workspace text,
username text,
user_id text,
PRIMARY KEY ((workspace), username)
);
""")
self.cassandra.execute("""
CREATE TABLE IF NOT EXISTS iam_api_keys (
key_hash text PRIMARY KEY,
id text,
user_id text,
name text,
prefix text,
expires timestamp,
created timestamp,
last_used timestamp
);
""")
self.cassandra.execute("""
CREATE INDEX IF NOT EXISTS iam_api_keys_user_id_idx
ON iam_api_keys (user_id);
""")
self.cassandra.execute("""
CREATE INDEX IF NOT EXISTS iam_api_keys_id_idx
ON iam_api_keys (id);
""")
self.cassandra.execute("""
CREATE TABLE IF NOT EXISTS iam_signing_keys (
kid text PRIMARY KEY,
private_pem text,
public_pem text,
created timestamp,
retired timestamp
);
""")
logger.info("IAM: Cassandra schema OK.")
def _prepare_statements(self):
c = self.cassandra
self.put_workspace_stmt = c.prepare("""
INSERT INTO iam_workspaces (id, name, enabled, created)
VALUES (?, ?, ?, ?)
""")
self.get_workspace_stmt = c.prepare("""
SELECT id, name, enabled, created FROM iam_workspaces
WHERE id = ?
""")
self.list_workspaces_stmt = c.prepare("""
SELECT id, name, enabled, created FROM iam_workspaces
""")
self.put_user_stmt = c.prepare("""
INSERT INTO iam_users (
id, workspace, username, name, email, password_hash,
roles, enabled, must_change_password, created
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""")
self.get_user_stmt = c.prepare("""
SELECT id, workspace, username, name, email, password_hash,
roles, enabled, must_change_password, created
FROM iam_users WHERE id = ?
""")
self.list_users_by_workspace_stmt = c.prepare("""
SELECT id, workspace, username, name, email, password_hash,
roles, enabled, must_change_password, created
FROM iam_users WHERE workspace = ? ALLOW FILTERING
""")
self.list_users_stmt = c.prepare("""
SELECT id, workspace, username, name, email, password_hash,
roles, enabled, must_change_password, created
FROM iam_users
""")
self.put_username_lookup_stmt = c.prepare("""
INSERT INTO iam_users_by_username (workspace, username, user_id)
VALUES (?, ?, ?)
""")
self.get_user_id_by_username_stmt = c.prepare("""
SELECT user_id FROM iam_users_by_username
WHERE workspace = ? AND username = ?
""")
self.delete_username_lookup_stmt = c.prepare("""
DELETE FROM iam_users_by_username
WHERE workspace = ? AND username = ?
""")
self.delete_user_stmt = c.prepare("""
DELETE FROM iam_users WHERE id = ?
""")
self.put_api_key_stmt = c.prepare("""
INSERT INTO iam_api_keys (
key_hash, id, user_id, name, prefix, expires,
created, last_used
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""")
self.get_api_key_by_hash_stmt = c.prepare("""
SELECT key_hash, id, user_id, name, prefix, expires,
created, last_used
FROM iam_api_keys WHERE key_hash = ?
""")
self.get_api_key_by_id_stmt = c.prepare("""
SELECT key_hash, id, user_id, name, prefix, expires,
created, last_used
FROM iam_api_keys WHERE id = ?
""")
self.list_api_keys_by_user_stmt = c.prepare("""
SELECT key_hash, id, user_id, name, prefix, expires,
created, last_used
FROM iam_api_keys WHERE user_id = ?
""")
self.delete_api_key_stmt = c.prepare("""
DELETE FROM iam_api_keys WHERE key_hash = ?
""")
self.put_signing_key_stmt = c.prepare("""
INSERT INTO iam_signing_keys (
kid, private_pem, public_pem, created, retired
)
VALUES (?, ?, ?, ?, ?)
""")
self.list_signing_keys_stmt = c.prepare("""
SELECT kid, private_pem, public_pem, created, retired
FROM iam_signing_keys
""")
self.retire_signing_key_stmt = c.prepare("""
UPDATE iam_signing_keys SET retired = ? WHERE kid = ?
""")
self.update_user_profile_stmt = c.prepare("""
UPDATE iam_users
SET name = ?, email = ?, roles = ?, enabled = ?,
must_change_password = ?
WHERE id = ?
""")
self.update_user_password_stmt = c.prepare("""
UPDATE iam_users
SET password_hash = ?, must_change_password = ?
WHERE id = ?
""")
self.update_user_enabled_stmt = c.prepare("""
UPDATE iam_users SET enabled = ? WHERE id = ?
""")
self.update_workspace_stmt = c.prepare("""
UPDATE iam_workspaces SET name = ?, enabled = ?
WHERE id = ?
""")
# ------------------------------------------------------------------
# Workspaces
# ------------------------------------------------------------------
async def put_workspace(self, id, name, enabled, created):
await async_execute(
self.cassandra, self.put_workspace_stmt,
(id, name, enabled, created),
)
async def get_workspace(self, id):
rows = await async_execute(
self.cassandra, self.get_workspace_stmt, (id,),
)
return rows[0] if rows else None
async def list_workspaces(self):
return await async_execute(
self.cassandra, self.list_workspaces_stmt,
)
# ------------------------------------------------------------------
# Users
# ------------------------------------------------------------------
async def put_user(
self, id, workspace, username, name, email, password_hash,
roles, enabled, must_change_password, created,
):
await async_execute(
self.cassandra, self.put_user_stmt,
(
id, workspace, username, name, email, password_hash,
set(roles) if roles else set(),
enabled, must_change_password, created,
),
)
await async_execute(
self.cassandra, self.put_username_lookup_stmt,
(workspace, username, id),
)
async def get_user(self, id):
rows = await async_execute(
self.cassandra, self.get_user_stmt, (id,),
)
return rows[0] if rows else None
async def get_user_id_by_username(self, workspace, username):
rows = await async_execute(
self.cassandra, self.get_user_id_by_username_stmt,
(workspace, username),
)
return rows[0][0] if rows else None
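    # Illustrative sketch (assumption, not part of this store): the
    # two-step lookup the iam_users_by_username table exists for --
    # resolve (workspace, username) to a user id, then fetch the full
    # record by its primary key.
    async def _example_get_user_by_username(self, workspace, username):
        user_id = await self.get_user_id_by_username(workspace, username)
        return await self.get_user(user_id) if user_id else None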
async def list_users_by_workspace(self, workspace):
return await async_execute(
self.cassandra, self.list_users_by_workspace_stmt, (workspace,),
)
async def list_users(self):
"""List every user across the deployment. Used by the
system-level list-users handler when no workspace filter is
supplied; the gateway has already authorised the call against
the caller's authority."""
return await async_execute(
self.cassandra, self.list_users_stmt, (),
)
async def delete_user(self, id):
await async_execute(
self.cassandra, self.delete_user_stmt, (id,),
)
async def delete_username_lookup(self, workspace, username):
await async_execute(
self.cassandra, self.delete_username_lookup_stmt,
(workspace, username),
)
# ------------------------------------------------------------------
# API keys
# ------------------------------------------------------------------
async def put_api_key(
self, key_hash, id, user_id, name, prefix, expires,
created, last_used,
):
await async_execute(
self.cassandra, self.put_api_key_stmt,
(key_hash, id, user_id, name, prefix, expires,
created, last_used),
)
async def get_api_key_by_hash(self, key_hash):
rows = await async_execute(
self.cassandra, self.get_api_key_by_hash_stmt, (key_hash,),
)
return rows[0] if rows else None
async def get_api_key_by_id(self, id):
rows = await async_execute(
self.cassandra, self.get_api_key_by_id_stmt, (id,),
)
return rows[0] if rows else None
async def list_api_keys_by_user(self, user_id):
return await async_execute(
self.cassandra, self.list_api_keys_by_user_stmt, (user_id,),
)
async def delete_api_key(self, key_hash):
await async_execute(
self.cassandra, self.delete_api_key_stmt, (key_hash,),
)
# ------------------------------------------------------------------
# Signing keys
# ------------------------------------------------------------------
async def put_signing_key(self, kid, private_pem, public_pem,
created, retired):
await async_execute(
self.cassandra, self.put_signing_key_stmt,
(kid, private_pem, public_pem, created, retired),
)
async def list_signing_keys(self):
return await async_execute(
self.cassandra, self.list_signing_keys_stmt,
)
async def retire_signing_key(self, kid, retired):
await async_execute(
self.cassandra, self.retire_signing_key_stmt,
(retired, kid),
)
# ------------------------------------------------------------------
# User partial updates
# ------------------------------------------------------------------
async def update_user_profile(
self, id, name, email, roles, enabled, must_change_password,
):
await async_execute(
self.cassandra, self.update_user_profile_stmt,
(
name, email,
set(roles) if roles else set(),
enabled, must_change_password, id,
),
)
async def update_user_password(
self, id, password_hash, must_change_password,
):
await async_execute(
self.cassandra, self.update_user_password_stmt,
(password_hash, must_change_password, id),
)
async def update_user_enabled(self, id, enabled):
await async_execute(
self.cassandra, self.update_user_enabled_stmt,
(enabled, id),
)
# ------------------------------------------------------------------
# Workspace updates
# ------------------------------------------------------------------
async def update_workspace(self, id, name, enabled):
await async_execute(
self.cassandra, self.update_workspace_stmt,
(name, enabled, id),
)
# ------------------------------------------------------------------
# Bootstrap helpers
# ------------------------------------------------------------------
async def any_workspace_exists(self):
rows = await self.list_workspaces()
return bool(rows)