diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 07af8db9..02c546df 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -75,6 +75,13 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
+ - name: "Free up some disk space"
+ run: |
+ sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
+ sudo rm -rf /opt/hostedtoolcache/CodeQL
+ podman image prune --all --force
+ podman builder prune -a -f
+
- name: Docker Hub token
run: echo ${{ secrets.DOCKER_SECRET }} > docker-token.txt
diff --git a/README.md b/README.md
index b1d94c52..17292e26 100644
--- a/README.md
+++ b/README.md
@@ -7,8 +7,6 @@
[](https://discord.gg/sQMwkRz5GX) [](https://deepwiki.com/trustgraph-ai/trustgraph)
-[](https://cossmology.com/organizations/trustgraph)
-
[**Website**](https://trustgraph.ai) | [**Docs**](https://docs.trustgraph.ai) | [**YouTube**](https://www.youtube.com/@TrustGraphAI?sub_confirmation=1) | [**Configuration Terminal**](https://config-ui.demo.trustgraph.ai/) | [**Discord**](https://discord.gg/sQMwkRz5GX) | [**Blog**](https://blog.trustgraph.ai/subscribe)
diff --git a/dev-tools/tests/smoke/smoke_ws_queries.py b/dev-tools/tests/smoke/smoke_ws_queries.py
new file mode 100755
index 00000000..c6a4dfb6
--- /dev/null
+++ b/dev-tools/tests/smoke/smoke_ws_queries.py
@@ -0,0 +1,475 @@
+#!/usr/bin/env python3
+"""
+WebSocket smoke / load test that hammers a TrustGraph gateway with a
+mix of `embeddings`, `graph-embeddings`, and `triples` queries while
+keeping a target number of in-flight requests at all times.
+
+Useful for reproducing the "worker hangs after a while, all subsequent
+requests time out" failure mode — leaves enough load on the system to
+saturate worker concurrency and reports per-service success/timeout
+rates and latency distributions over time.
+
+Usage:
+ smoke_ws_queries.py --flow onto-rag --duration 120 --concurrency 20
+
+Connects via /api/v1/socket using the first-frame auth protocol.
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import random
+import statistics
+import sys
+import time
+import uuid
+from collections import defaultdict
+from typing import Any
+
+import websockets
+
+
+# Default value of --text: the text sent in embeddings requests and in
+# the seed-embedding call when the caller does not supply their own.
+DEFAULT_TEXT = (
+    "What caused the space shuttle to explode and what were the "
+    "main factors leading to the disaster?"
+)
+
+
+class Stats:
+    """Outcome counters and successful-request latencies for one service."""
+
+    def __init__(self) -> None:
+        # `sent` is incremented directly by callers; the record_* methods
+        # maintain the three outcome counters.
+        self.sent = self.ok = self.err = self.timeout = 0
+        self.latencies_ms: list[float] = []
+
+    def record_ok(self, latency_ms: float) -> None:
+        """Count one success and keep its latency sample."""
+        self.latencies_ms.append(latency_ms)
+        self.ok += 1
+
+    def record_err(self) -> None:
+        """Count one errored request."""
+        self.err += 1
+
+    def record_timeout(self) -> None:
+        """Count one timed-out request."""
+        self.timeout += 1
+
+    def percentile(self, p: float) -> float:
+        """Return the latency at fraction ``p`` of the sorted samples.
+
+        The rank is ``int(len * p)`` clamped to the last index; with no
+        samples the result is 0.0.
+        """
+        ordered = sorted(self.latencies_ms)
+        if not ordered:
+            return 0.0
+        rank = int(len(ordered) * p)
+        return ordered[min(rank, len(ordered) - 1)]
+
+    def summary(self) -> str:
+        """Render the counters and latency statistics on one line."""
+        counters = (
+            f"sent={self.sent} ok={self.ok} err={self.err} "
+            f"timeout={self.timeout}"
+        )
+        samples = self.latencies_ms
+        if not samples:
+            return f"{counters} | no successful samples"
+        p50, p95, p99 = (self.percentile(q) for q in (0.50, 0.95, 0.99))
+        latency = (
+            f"min={min(samples):.0f} mean={statistics.mean(samples):.0f} "
+            f"p50={p50:.0f} p95={p95:.0f} p99={p99:.0f} "
+            f"max={max(samples):.0f} ms"
+        )
+        return f"{counters} | {latency}"
+
+
+class WSClient:
+    """Thin async websocket client with first-frame auth.
+
+    A single background reader task demultiplexes incoming frames to
+    per-request ``asyncio.Queue`` objects keyed by request id.  When the
+    reader stops for any reason, every request still waiting is woken
+    with an error so a dropped connection is reported as an error
+    rather than as a timeout.
+    """
+
+    def __init__(
+        self, url: str, token: str | None, workspace: str,
+        ping_timeout: int,
+    ) -> None:
+        self.url = url
+        self.token = token
+        self.workspace = workspace
+        self.ping_timeout = ping_timeout
+        self._ws: Any = None
+        # Request id -> queue receiving that request's response frames.
+        self._pending: dict[str, asyncio.Queue] = {}
+        self._reader_task: asyncio.Task | None = None
+        # Set once the reader loop has exited.
+        self._closed = asyncio.Event()
+
+    async def connect(self) -> None:
+        """Open the socket, authenticate if a token is set, start the reader.
+
+        Raises:
+            RuntimeError: the reply to the auth frame was not ``auth-ok``.
+            asyncio.TimeoutError: no auth reply arrived within 10 seconds.
+        """
+        ws_url = self.url.rstrip("/") + "/api/v1/socket"
+        if ws_url.startswith("http://"):
+            ws_url = "ws://" + ws_url[len("http://"):]
+        elif ws_url.startswith("https://"):
+            ws_url = "wss://" + ws_url[len("https://"):]
+        elif not ws_url.startswith(("ws://", "wss://")):
+            ws_url = "ws://" + ws_url
+
+        self._ws = await websockets.connect(
+            ws_url,
+            ping_interval=20,
+            ping_timeout=self.ping_timeout,
+            max_size=64 * 1024 * 1024,
+        )
+
+        if self.token:
+            # First-frame auth handshake.
+            try:
+                await self._ws.send(json.dumps({
+                    "type": "auth", "token": self.token,
+                }))
+                raw = await asyncio.wait_for(self._ws.recv(), timeout=10)
+                resp = json.loads(raw)
+                if not isinstance(resp, dict) or resp.get("type") != "auth-ok":
+                    raise RuntimeError(f"auth failed: {resp}")
+            except Exception:
+                # Do not leak the socket when the handshake fails, times
+                # out, or the reply is not valid JSON.
+                await self._ws.close()
+                raise
+            if "workspace" in resp:
+                # Server-resolved workspace overrides the user-supplied
+                # one, mirroring AsyncSocketClient behaviour.
+                self.workspace = resp["workspace"]
+        else:
+            print(
+                "WARNING: no token provided — skipping auth handshake. "
+                "Requests will be rejected unless the gateway is "
+                "running without IAM enforcement.",
+                file=sys.stderr,
+            )
+
+        self._reader_task = asyncio.create_task(self._reader())
+
+    def _fail_pending(self, message: str) -> None:
+        """Wake every waiting request with an error carrying ``message``."""
+        for q in list(self._pending.values()):
+            # Queues are unbounded, so put_nowait cannot raise QueueFull.
+            q.put_nowait({"error": {"message": message}})
+
+    async def _reader(self) -> None:
+        """Route incoming frames to waiting requests until the socket ends."""
+        reason = "connection closed"
+        try:
+            async for raw in self._ws:
+                try:
+                    msg = json.loads(raw)
+                except ValueError as e:
+                    # One undecodable frame must not kill the reader and
+                    # strand every in-flight request.
+                    print(
+                        f"WARNING: dropping undecodable frame: {e}",
+                        file=sys.stderr,
+                    )
+                    continue
+                if not isinstance(msg, dict):
+                    continue
+                rid = msg.get("id")
+                q = self._pending.get(rid) if isinstance(rid, str) else None
+                if q is not None:
+                    q.put_nowait(msg)
+        except websockets.exceptions.ConnectionClosed as e:
+            reason = f"connection closed: {e}"
+        except Exception as e:
+            reason = str(e)
+        finally:
+            self._closed.set()
+            # Nothing will ever answer the requests still registered.
+            self._fail_pending(reason)
+
+    async def request(
+        self, service: str, flow: str | None, body: dict, timeout: float,
+    ) -> tuple[dict | None, str | None, float]:
+        """Send one request, await final response.
+
+        Returns ``(response, error, latency_ms)``. ``response`` is None
+        on error/timeout. ``error`` is ``"timeout"``, ``"error: ..."``,
+        or None on success.  ``timeout`` applies to each awaited frame,
+        so an intermediate streaming chunk restarts the wait.
+        """
+        rid = str(uuid.uuid4())
+        q: asyncio.Queue = asyncio.Queue()
+        self._pending[rid] = q
+        env = {
+            "id": rid,
+            "workspace": self.workspace,
+            "service": service,
+            "request": body,
+        }
+        if flow:
+            env["flow"] = flow
+
+        t0 = time.monotonic()
+
+        def elapsed_ms() -> float:
+            return (time.monotonic() - t0) * 1000
+
+        try:
+            await self._ws.send(json.dumps(env))
+            while True:
+                try:
+                    msg = await asyncio.wait_for(q.get(), timeout=timeout)
+                except asyncio.TimeoutError:
+                    return None, "timeout", elapsed_ms()
+                err = msg.get("error")
+                if err:
+                    err_msg = (
+                        err.get("message") if isinstance(err, dict)
+                        else str(err)
+                    )
+                    return None, f"error: {err_msg}", elapsed_ms()
+                if msg.get("complete"):
+                    return msg.get("response"), None, elapsed_ms()
+                # Otherwise an intermediate streaming chunk — keep waiting.
+        finally:
+            self._pending.pop(rid, None)
+
+    async def close(self) -> None:
+        """Close the socket and give the reader task up to 2 s to finish."""
+        if self._ws is not None:
+            await self._ws.close()
+        if self._reader_task is not None:
+            try:
+                await asyncio.wait_for(self._reader_task, timeout=2)
+            except (asyncio.TimeoutError, asyncio.CancelledError):
+                pass
+
+
+def _positive_int(text: str) -> int:
+    """argparse ``type=`` callable that accepts only integers >= 1."""
+    value = int(text)
+    if value < 1:
+        raise argparse.ArgumentTypeError(f"must be >= 1, got {text}")
+    return value
+
+
+def _positive_float(text: str) -> float:
+    """argparse ``type=`` callable that accepts only numbers > 0."""
+    value = float(text)
+    if not value > 0:  # also rejects NaN
+        raise argparse.ArgumentTypeError(f"must be > 0, got {text}")
+    return value
+
+
+def parse_args() -> argparse.Namespace:
+    """Build the command-line parser and parse ``sys.argv``.
+
+    Duration, concurrency, request timeout and report interval must be
+    positive: a zero report interval would divide by zero when rates
+    are computed, and zero workers or zero duration would report a
+    vacuous success.
+    """
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument(
+        "--url",
+        default=os.getenv("TRUSTGRAPH_URL", "http://localhost:8088/"),
+        help="Gateway URL (http or ws). Default: %(default)s",
+    )
+    p.add_argument(
+        "--token",
+        default=os.getenv("TRUSTGRAPH_TOKEN"),
+        help="Auth token (or set TRUSTGRAPH_TOKEN). Optional — if "
+             "omitted, the auth handshake is skipped (only works "
+             "when the gateway is running without IAM enforcement).",
+    )
+    p.add_argument(
+        "--workspace", default="default",
+        help="Workspace. Default: %(default)s",
+    )
+    p.add_argument(
+        "--flow", required=True,
+        help="Flow id. Comma-separated for round-robin across flows "
+             "(e.g. onto-rag,doc-rag).",
+    )
+    p.add_argument(
+        "--duration", type=_positive_int, default=60,
+        help="Test duration in seconds. Default: %(default)s",
+    )
+    p.add_argument(
+        "--concurrency", type=_positive_int, default=15,
+        help="Target in-flight request count. Default: %(default)s",
+    )
+    p.add_argument(
+        "--services",
+        default="embeddings,graph-embeddings,triples",
+        help="Comma-separated services to exercise. "
+             "Default: %(default)s",
+    )
+    p.add_argument(
+        "--limit", type=int, default=3,
+        help="limit for triples / graph-embeddings queries. "
+             "Default: %(default)s",
+    )
+    p.add_argument(
+        "--collection", default="default",
+        help="Collection. Default: %(default)s",
+    )
+    p.add_argument(
+        "--text", default=DEFAULT_TEXT,
+        help="Text to embed for embeddings/seed.",
+    )
+    p.add_argument(
+        "--vector-dim", type=int, default=384,
+        help="Dimension of synthetic vector when --no-seed is used. "
+             "Default: %(default)s",
+    )
+    p.add_argument(
+        "--no-seed", action="store_true",
+        help="Skip the embeddings warm-up call. Use a random vector "
+             "for graph-embeddings queries instead.",
+    )
+    p.add_argument(
+        "--request-timeout", type=_positive_float, default=30.0,
+        help="Per-request timeout (seconds). Default: %(default)s",
+    )
+    p.add_argument(
+        "--report-interval", type=_positive_float, default=5.0,
+        help="How often to print stats (seconds). Default: %(default)s",
+    )
+    p.add_argument(
+        "--ping-timeout", type=int, default=120,
+        help="Websocket ping timeout. Default: %(default)s",
+    )
+    p.add_argument(
+        "--seed", type=int, default=None,
+        help="Random seed (for reproducibility).",
+    )
+    return p.parse_args()
+
+
+async def seed_vector(
+    client: WSClient, flow: str, text: str, timeout: float,
+) -> list[float]:
+    """Fetch a real embedding for ``text`` with one `embeddings` request.
+
+    The first vector of the response is returned so that later
+    graph-embeddings calls can reuse it.  Raises RuntimeError when the
+    request reports an error, yields an empty response, or carries no
+    vectors.
+    """
+    payload = {"texts": [text]}
+    resp, err, _latency = await client.request(
+        "embeddings", flow, payload, timeout,
+    )
+    if err or not resp:
+        raise RuntimeError(f"seed embeddings failed: {err or resp}")
+    vectors = resp.get("vectors")
+    if vectors:
+        return vectors[0]
+    raise RuntimeError(f"seed embeddings: no vectors in response: {resp}")
+
+
+def make_request_body(
+    service: str, args: argparse.Namespace, vector: list[float],
+) -> dict:
+    """Build the request payload for one call to ``service``.
+
+    `embeddings` sends ``args.text``; `triples` sends the limit and
+    collection; `graph-embeddings` sends those plus ``vector``.  Any
+    other service name raises ValueError.
+    """
+    if service == "embeddings":
+        return {"texts": [args.text]}
+    if service not in ("graph-embeddings", "triples"):
+        raise ValueError(f"Unknown service: {service}")
+    query = {"limit": args.limit, "collection": args.collection}
+    if service == "graph-embeddings":
+        # Keep "vector" as the first key, as in the payload's usual order.
+        return {"vector": vector, **query}
+    return query
+
+
+async def worker(
+    name: int,
+    client: WSClient,
+    flows: list[str],
+    services: list[str],
+    args: argparse.Namespace,
+    vector: list[float],
+    stats: dict[str, Stats],
+    in_flight: dict[str, int],
+    stop_at: float,
+) -> None:
+    """Issue requests back-to-back until the monotonic clock hits ``stop_at``.
+
+    Each iteration picks a random service and flow, sends one request
+    and records the outcome (ok / timeout / error) in ``stats`` while
+    keeping the ``in_flight`` counter for that service up to date.
+    """
+    # With --seed, every worker gets its own reproducible stream.  With
+    # no seed the RNG seeds itself, so unseeded runs are not silently
+    # identical from one invocation to the next.
+    rng = random.Random(None if args.seed is None else args.seed + name)
+    error_backoff_s = 0.1
+    while time.monotonic() < stop_at:
+        svc = rng.choice(services)
+        flow = rng.choice(flows)
+        body = make_request_body(svc, args, vector)
+
+        stats[svc].sent += 1
+        in_flight[svc] += 1
+        raised = False
+        try:
+            _resp, err, lat = await client.request(
+                svc, flow, body, args.request_timeout,
+            )
+            if err == "timeout":
+                stats[svc].record_timeout()
+            elif err:
+                stats[svc].record_err()
+            else:
+                stats[svc].record_ok(lat)
+        except Exception as e:
+            raised = True
+            stats[svc].record_err()
+            print(f"worker {name}: unexpected {svc} exception: {e}",
+                  file=sys.stderr)
+        finally:
+            in_flight[svc] -= 1
+
+        if raised:
+            # A dead socket makes request() raise immediately; pause
+            # briefly so the loop cannot spin and starve other tasks.
+            remaining = stop_at - time.monotonic()
+            await asyncio.sleep(max(0.0, min(error_backoff_s, remaining)))
+
+
+async def reporter(
+    services: list[str],
+    stats: dict[str, Stats],
+    in_flight: dict[str, int],
+    stop_at: float,
+    interval: float,
+) -> None:
+    """Print in-flight counts and per-service stats every ``interval`` s.
+
+    Runs until the monotonic clock passes ``stop_at``.  The send rate is
+    computed over the measured time since the previous report rather
+    than the nominal interval, because the sleep can overrun when the
+    event loop is busy.
+    """
+    started = time.monotonic()
+    last_report = started
+    last_sent = {s: 0 for s in services}
+    while time.monotonic() < stop_at:
+        await asyncio.sleep(interval)
+        now = time.monotonic()
+        window = now - last_report
+        last_report = now
+        elapsed = now - started
+        total_inflight = sum(in_flight.values())
+        print(
+            f"\n[{elapsed:6.1f}s] in-flight={total_inflight} "
+            f"per-svc={dict(in_flight)}"
+        )
+        for svc in services:
+            s = stats[svc]
+            delta = s.sent - last_sent[svc]
+            # Guard the division so a zero-length window cannot crash.
+            rate = delta / window if window > 0 else 0.0
+            last_sent[svc] = s.sent
+            print(f" {svc:20s} {rate:6.1f}/s | {s.summary()}")
+
+
+async def run(args: argparse.Namespace) -> int:
+    """Run the whole load test and return the process exit status.
+
+    Returns 2 for invalid --services / --flow input, 1 when any request
+    errored or timed out, and 0 otherwise.  Once the connection has
+    been opened it is closed on every exit path, including failures.
+    """
+    if args.seed is not None:
+        random.seed(args.seed)
+
+    services = [s.strip() for s in args.services.split(",") if s.strip()]
+    flows = [f.strip() for f in args.flow.split(",") if f.strip()]
+    valid = {"embeddings", "graph-embeddings", "triples"}
+    bad = [s for s in services if s not in valid]
+    if bad:
+        print(f"ERROR: unknown service(s): {bad}. "
+              f"Supported: {sorted(valid)}", file=sys.stderr)
+        return 2
+    if not services or not flows:
+        # Inputs such as "--flow ," parse to an empty list, which would
+        # otherwise surface later as an IndexError.
+        print("ERROR: at least one service and one flow are required.",
+              file=sys.stderr)
+        return 2
+
+    client = WSClient(
+        args.url, args.token, args.workspace, args.ping_timeout,
+    )
+    print(f"Connecting to {args.url} ...")
+    await client.connect()
+    try:
+        print(f"Connected. workspace={client.workspace} flows={flows} "
+              f"services={services} concurrency={args.concurrency} "
+              f"duration={args.duration}s")
+
+        if "graph-embeddings" in services and not args.no_seed:
+            print("Seeding embedding vector ...")
+            vector = await seed_vector(
+                client, flows[0], args.text, args.request_timeout,
+            )
+            print(f"Got vector of length {len(vector)}")
+        else:
+            vector = [
+                random.uniform(-1.0, 1.0) for _ in range(args.vector_dim)
+            ]
+
+        stats: dict[str, Stats] = defaultdict(Stats)
+        in_flight: dict[str, int] = defaultdict(int)
+        for svc in services:
+            stats[svc]  # touch so every service appears in the reports
+            in_flight[svc] = 0
+
+        stop_at = time.monotonic() + args.duration
+        print(f"Starting load: {args.concurrency} workers for "
+              f"{args.duration}s ...")
+
+        workers = [
+            asyncio.create_task(
+                worker(
+                    i, client, flows, services, args, vector,
+                    stats, in_flight, stop_at,
+                )
+            )
+            for i in range(args.concurrency)
+        ]
+        rep = asyncio.create_task(
+            reporter(
+                services, stats, in_flight, stop_at, args.report_interval,
+            )
+        )
+
+        try:
+            await asyncio.gather(*workers)
+        finally:
+            rep.cancel()
+            try:
+                await rep
+            except asyncio.CancelledError:
+                pass
+
+        print("\n=== Final results ===")
+        any_failures = False
+        for svc in services:
+            s = stats[svc]
+            print(f" {svc:20s} {s.summary()}")
+            if s.timeout > 0 or s.err > 0:
+                any_failures = True
+
+        return 1 if any_failures else 0
+    finally:
+        # Release the socket and reader task even when seeding or the
+        # load phase fails.
+        await client.close()
+
+
+def main() -> int:
+    """Parse the command line, run the load test, return its exit status.
+
+    A keyboard interrupt during the run maps to status 130.
+    """
+    options = parse_args()
+    try:
+        status = asyncio.run(run(options))
+    except KeyboardInterrupt:
+        status = 130
+    return status
+
+
+# Script entry point: exit with the status returned by main().
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/docs/tech-specs/bootstrap.md b/docs/tech-specs/bootstrap.md
new file mode 100644
index 00000000..af7387d1
--- /dev/null
+++ b/docs/tech-specs/bootstrap.md
@@ -0,0 +1,297 @@
+---
+layout: default
+title: "Bootstrap Framework Technical Specification"
+parent: "Tech Specs"
+---
+
+# Bootstrap Framework Technical Specification
+
+## Overview
+
+A generic, pluggable framework for running one-time initialisation steps
+against a TrustGraph deployment — replacing the dedicated
+`tg-init-trustgraph` container with a long-running processor that
+converges the system to a desired initial state and then idles.
+
+The framework is content-agnostic. It knows how to run, retry,
+mark-as-done, and surface failures; the actual init work lives in
+small pluggable classes called **initialisers**. Core initialisers
+ship in the `trustgraph-flow` package; enterprise and third-party
+initialisers can be loaded by dotted path without any core code
+change.
+
+## Motivation
+
+The existing `tg-init-trustgraph` is a one-shot CLI run in its own
+container. It performs two very different jobs (Pulsar topology
+setup and config seeding) in a single script, ties up a whole
+container for a one-shot task, cannot handle partial-success states,
+and has no way to extend the boot process with enterprise-specific
+concerns (user provisioning, workspace initialisation, IAM
+scaffolding) without forking the tool.
+
+A pluggable, long-running reconciler addresses all of this and slots
+naturally into the existing processor-group model.
+
+## Design
+
+### Bootstrapper Processor
+
+A single `AsyncProcessor` subclass. One entry in a processor group.
+Parameters include the processor's own identity and a list of
+**initialiser specifications** — each spec names a class (by dotted
+path), a unique instance name, a flag string, and the parameters
+that will be passed to the initialiser's constructor.
+
+On each wake the bootstrapper does the following, in order:
+
+1. Open a short-lived context (config client, flow-svc client,
+ logger). The context is torn down at the end of the wake so
+ steady-state idle cost is effectively nil.
+2. Run all **pre-service initialisers** (those that opt out of the
+   service gate — principally `PulsarTopology`, which must run
+   before the services checked by the gate can even come up).
+3. Check the **service gate**: cheap round-trips to config-svc and
+ flow-svc. If either fails, skip to the sleep step using the
+ short gate-retry cadence.
+4. Run all **post-service initialisers** that haven't already
+ completed at the currently-configured flag.
+5. Sleep. Cadence adapts to state (see below).
+
+### Initialiser Contract
+
+An initialiser is a class with:
+
+- A class-level `name` identifier, unique within the bootstrapper's
+ configuration. This is the key under which completion state is
+ stored.
+- A class-level `wait_for_services` flag. When `True` (the default)
+ the initialiser runs only after the service gate passes. When
+ `False`, it runs before the gate, on every wake.
+- A constructor that accepts the initialiser's own params as kwargs.
+- An async `run(ctx, old_flag, new_flag)` method that performs the
+ init work and returns on success. Any raised exception is
+ logged and treated as a transient failure — the stored flag is
+ not updated and the initialiser will re-run on the next cycle.
+
+`old_flag` is the previously-stored flag string, or `None` if the
+initialiser has never successfully run in this deployment. `new_flag`
+is the flag the operator has configured for this run. This pair
+lets an initialiser distinguish a clean first-run from a migration
+between flag versions and behave accordingly (see "Flag change and
+re-run safety" below).
+
+### Context
+
+The context is the bootstrapper-owned object passed to every
+initialiser's `run()` method. Its fields are deliberately narrow:
+
+| Field | Purpose |
+|---|---|
+| `logger` | A child logger named for the initialiser instance |
+| `config` | A short-lived `ConfigClient` for config-svc reads/writes |
+| `flow` | A short-lived `RequestResponse` client for flow-svc |
+
+The context is always fully-populated regardless of which services
+a given initialiser uses, for symmetry. Additional fields may be
+added in future without breaking existing initialisers. Clients are
+started at the beginning of a wake cycle and stopped at the end.
+
+Initialisers that need services beyond config-svc and flow-svc are
+responsible for their own readiness checks and for raising cleanly
+when a prerequisite is not met.
+
+### Completion State
+
+Per-initialiser completion state is stored in the reserved
+`__system__` workspace, under a dedicated config type for bootstrap
+state. The stored value is the flag string that was configured when
+the initialiser last succeeded.
+
+On each cycle, for each initialiser, the bootstrapper reads the
+stored flag and compares it to the currently-configured flag. If
+they match, the initialiser is skipped silently. If they differ,
+the initialiser runs; on success, the stored flag is updated.
+
+Because the state lives in a reserved (`_`-prefixed) workspace, it
+is stored by config-svc but excluded from the config push broadcast.
+Live processors never see it and cannot act on it.
+
+### The Service Gate
+
+The gate is a cheap, bootstrapper-internal check that config-svc
+and flow-svc are both reachable and responsive. It is intentionally
+a simple pair of low-cost round-trips — a config list against
+`__system__` and a flow-svc `list-blueprints` — rather than any
+deeper health check.
+
+Its purpose is to avoid filling logs with noise and to concentrate
+retry effort during the brief window when services are coming up.
+The gate is applied only to initialisers with
+`wait_for_services=True` (the default); `False` is reserved for
+initialisers that set up infrastructure the gate itself depends on.
+
+### Adaptive Cadence
+
+The sleep between wake cycles is chosen from three tiers based on
+observed state:
+
+| Tier | Duration | When |
+|---|---|---|
+| Gate backoff | ~5 s | Services not responding — concentrate retry during startup |
+| Init retry | ~15 s | Gate passes but at least one initialiser is not yet at its configured flag — transient failures, waiting on prereqs, recently-bumped flag not yet applied |
+| Steady | ~300 s | All configured initialisers at their configured flag; gate passes; nothing to do |
+
+The short tiers ensure a fresh deployment converges quickly;
+steady state costs a single round-trip per initialiser every few
+minutes.
+
+### Failure Handling
+
+An initialiser raising an exception does not stop the bootstrapper
+or block other initialisers. Each initialiser in the cycle is
+attempted independently; failures are logged and retried on the next
+cycle. This means there is no ordered-DAG enforcement: order of
+initialisers in the configuration determines the attempt order
+within a cycle, but a dependency between two initialisers is
+expressed by the dependant raising cleanly when its prerequisite
+isn't satisfied. Over successive cycles the system converges.
+
+### Flag Change and Re-run Safety
+
+Each initialiser's completion state is a string flag chosen by the
+operator. Typically these follow a simple version pattern
+(`v1`, `v2`, ...), but the bootstrapper imposes no format.
+
+Changing the flag in the group configuration causes the
+corresponding initialiser to re-run on the next cycle. Initialisers
+must be written so that re-running after a flag bump is safe — they
+receive both the previous and the new flag and are responsible for
+either cleanly re-applying the work or performing a step-change
+migration from the prior state.
+
+This gives operators an explicit, visible mechanism for triggering
+re-initialisation. Re-runs are never implicit.
+
+## Core Initialisers
+
+The following initialisers ship in `trustgraph.bootstrap.initialisers`
+and cover the base deployment case.
+
+### PulsarTopology
+
+Creates the Pulsar tenant and the four namespaces
+(`flow`, `request`, `response`, `notify`) with appropriate
+retention policies if they don't exist.
+
+Opts out of the service gate (`wait_for_services = False`) because
+config-svc and flow-svc cannot come online until the Pulsar
+namespaces exist.
+
+Parameters: Pulsar admin URL, tenant name.
+
+Idempotent via the admin API (GET-then-PUT). Flag change causes
+re-evaluation of all namespaces; any absent are created.
+
+### TemplateSeed
+
+Populates the reserved `__template__` workspace from an external
+JSON seed file. The seed file has the standard shape of
+`{config-type: {config-key: value}}`.
+
+Runs post-gate. Parameters: path to the seed file, overwrite
+policy (upsert-missing only, or overwrite-all).
+
+On clean run, writes the whole file. On flag change, behaviour
+depends on the overwrite policy — typically upsert-missing so
+that operator-customised keys are preserved across seed-file
+upgrades.
+
+### WorkspaceInit
+
+Creates a named workspace and populates it from the seed file or
+from the full contents of the `__template__` workspace.
+
+Runs post-gate. Parameters: workspace name, `source` (`seed-file` or
+`template`), optional `seed_file` path, `overwrite` flag.
+
+When `source` is `template`, the initialiser copies every config
+type and key present in `__template__` — there is no per-type
+selection. Deployments that want to seed only a subset should
+either curate the seed file they feed to `TemplateSeed` or use
+`source: seed-file` directly here.
+
+Raises cleanly if its source does not exist — depends on
+`TemplateSeed` having run in the same cycle or a prior one.
+
+### DefaultFlowStart
+
+Starts a specific flow in a specific workspace using a specific
+blueprint.
+
+Runs post-gate. Parameters: workspace name, flow id, blueprint
+name, description, optional parameter overrides.
+
+Separated from `WorkspaceInit` deliberately so that deployments
+which want a workspace without an auto-started flow can simply omit
+this initialiser from their bootstrap configuration.
+
+## Extensibility
+
+New initialisers are added by:
+
+1. Subclassing the initialiser base class.
+2. Implementing `run(ctx, old_flag, new_flag)`.
+3. Choosing `wait_for_services` (almost always `True`).
+4. Adding an entry in the bootstrapper's configuration with the new
+ class's dotted path.
+
+No core code changes are required to add an enterprise or third-party
+initialiser. Enterprise builds ship their own package with their own
+initialiser classes (e.g. `CreateAdminUser`, `ProvisionWorkspaces`)
+and reference them in the bootstrapper config alongside the core
+initialisers.
+
+## Reserved Workspaces
+
+This specification relies on the "reserved workspace" convention:
+
+- Any workspace id beginning with `_` is reserved.
+- Reserved workspaces are stored normally by config-svc but never
+ appear in the config push broadcast.
+- Live processors cannot react to reserved-workspace state.
+
+The bootstrapper uses two reserved workspaces:
+
+- `__template__` — factory-default seed config, readable by
+ initialisers that copy-from-template.
+- `__system__` — bootstrapper completion state (under the
+ `init-state` config type) and any other system-internal bookkeeping.
+
+See the reserved-workspace convention in the config service for
+the general rule and its enforcement.
+
+## Non-Goals
+
+- No DAG scheduling across initialisers. Dependencies are expressed
+  by the dependant failing cleanly until its prerequisite is met;
+  the system converges over subsequent cycles.
+- No parallel execution of initialisers within a cycle. A cycle runs
+ each initialiser sequentially.
+- No implicit re-runs. Re-running an initialiser requires an explicit
+ flag change by the operator.
+- No cross-initialiser atomicity. Each initialiser's completion is
+ recorded independently on its own success.
+
+## Operational Notes
+
+- Running the bootstrapper as a processor-group entry replaces the
+ previous `tg-init-trustgraph` container. The bootstrapper is also
+ CLI-invocable directly for standalone testing via
+ `Processor.launch(...)`.
+- First-boot convergence is typically a handful of short cycles
+  followed by a transition to the steady cadence. Deployments
+  should expect the first few minutes of logs to show
+  initialisation activity, followed by effective silence.
+- Bumping a flag is a deliberate operational act. The log line
+ emitted on re-run makes the event visible for audit.
diff --git a/docs/tech-specs/capabilities.md b/docs/tech-specs/capabilities.md
new file mode 100644
index 00000000..7717cbc9
--- /dev/null
+++ b/docs/tech-specs/capabilities.md
@@ -0,0 +1,273 @@
+---
+layout: default
+title: "Capability Vocabulary Technical Specification"
+parent: "Tech Specs"
+---
+
+# Capability Vocabulary Technical Specification
+
+## Overview
+
+Every gateway endpoint maps to exactly one *capability* — a string
+from a closed vocabulary defined in this document. When the
+gateway authorises a request, it hands the IAM regime four things:
+the authenticated identity, the required capability, the
+operation's resource (the structured identifier of what's being
+operated on), and the operation's parameters. The IAM regime
+decides allow or deny; see the [IAM contract](iam-contract.md) for
+the full abstraction.
+
+A capability is a **permission**, not a structural classification.
+`graph:read` says "the caller may read graphs"; it does not say
+where graphs live or how they are addressed. The shape of a
+request — whether workspace appears in the URL, the envelope, or
+the body, and whether it is a resource address component or an
+operation parameter — is determined by what the operation operates
+on, not by what permission it requires. Permission and structure
+are orthogonal; the contract takes both.
+
+This document defines:
+
+- The **capability vocabulary** — the closed list of capability
+ strings the gateway uses as input to `authorise`. All IAM
+ regimes share this vocabulary; that's the only schema the
+ gateway and the IAM regime have to agree on.
+- The **open-source role bundles** — the role-and-scope table the
+ OSS IAM regime uses to answer `authorise` calls. Other regimes
+ answer the same call differently; the bundles below are an
+ OSS-specific implementation detail, not a contract assertion.
+
+A regime may evaluate `authorise` using role bundles (OSS), IdP
+group memberships, attribute-based policies, relationship tuples,
+or any other mechanism. The gateway is unaware of which. The
+capability strings — and the resource component vocabulary the
+gateway populates alongside them — are the only thing both sides
+have to agree on.
+
+## Motivation
+
+The original IAM spec used hierarchical "minimum role" checks
+(`admin` implies `writer` implies `reader`). That shape is simple
+but paints the role model into a corner: any enterprise need to
+grant a subset of admin abilities (helpdesk that can reset
+passwords but not edit flows; analyst who can query but not ingest)
+requires a protocol-level change.
+
+A capability vocabulary decouples "what a request needs" from
+"what roles a user has" and makes the role table pure data. The
+open-source bundles can stay coarse while the enterprise role
+table expands without any code movement.
+
+## Design
+
+### Capability string format
+
+`<subsystem>:<verb>` or `<subsystem>` (for capabilities with no
+natural read/write split). All lowercase, kebab-case for
+multi-word subsystems.
+
+### Capability list
+
+**Data plane**
+
+| Capability | Covers |
+|---|---|
+| `agent` | agent (query-only; no write counterpart) |
+| `graph:read` | graph-rag, graph-embeddings-query, triples-query, sparql, graph-embeddings-export, triples-export |
+| `graph:write` | triples-import, graph-embeddings-import |
+| `documents:read` | document-rag, document-embeddings-query, document-embeddings-export, entity-contexts-export, document-stream-export, library list / fetch |
+| `documents:write` | document-embeddings-import, entity-contexts-import, text-load, document-load, library add / replace / delete |
+| `rows:read` | rows-query, row-embeddings-query, nlp-query, structured-query, structured-diag |
+| `rows:write` | rows-import |
+| `llm` | text-completion, prompt (stateless invocation) |
+| `embeddings` | Raw text-embedding service (stateless compute; typed-data embedding stores live under their data-subject capability) |
+| `mcp` | mcp-tool |
+| `collections:read` | List / describe collections |
+| `collections:write` | Create / delete collections |
+| `knowledge:read` | List / get knowledge cores |
+| `knowledge:write` | Create / delete knowledge cores |
+
+**Control plane**
+
+| Capability | Covers |
+|---|---|
+| `config:read` | Read workspace config |
+| `config:write` | Write workspace config |
+| `flows:read` | List / describe flows, blueprints, flow classes |
+| `flows:write` | Start / stop / update flows |
+| `users:read` | List / get users within the workspace |
+| `users:write` | Create / update / disable users within the workspace |
+| `users:admin` | Assign / remove roles on users within the workspace |
+| `keys:self` | Create / revoke / list **own** API keys |
+| `keys:admin` | Create / revoke / list **any user's** API keys within the workspace |
+| `workspaces:admin` | Create / delete / disable workspaces (system-level) |
+| `iam:admin` | JWT signing-key rotation, IAM-level operations |
+| `metrics:read` | Prometheus metrics proxy |
+
+### Open-source role bundles
+
+The open-source edition ships three roles:
+
+| Role | Capabilities |
+|---|---|
+| `reader` | `agent`, `graph:read`, `documents:read`, `rows:read`, `llm`, `embeddings`, `mcp`, `collections:read`, `knowledge:read`, `flows:read`, `config:read`, `keys:self` |
+| `writer` | everything in `reader` **+** `graph:write`, `documents:write`, `rows:write`, `collections:write`, `knowledge:write` |
+| `admin` | everything in `writer` **+** `config:write`, `flows:write`, `users:read`, `users:write`, `users:admin`, `keys:admin`, `workspaces:admin`, `iam:admin`, `metrics:read` |
+
+Open-source bundles are deliberately coarse. `workspaces:admin` and
+`iam:admin` live inside `admin` without a separate role; a single
+`admin` user holds the keys to the whole deployment.
+
+### The `agent` capability and composition
+
+The `agent` capability is granted independently of the capabilities
+it composes under the hood (`llm`, `graph`, `documents`, `rows`,
+`mcp`, etc.). A user holding `agent` but not `llm` can still cause
+LLM invocations because the agent implementation chooses which
+services to invoke on the caller's behalf.
+
+This is deliberate. A common policy is "allow controlled access
+via the agent, deny raw model calls" — granting `agent` without
+granting `llm` expresses exactly that. An administrator granting
+`agent` should treat it as a grant of everything the agent
+composes at deployment time.
+
+### Authorisation evaluation (OSS regime)
+
+This section describes how the OSS IAM regime answers
+`authorise(identity, capability, resource, parameters)`. Other
+regimes answer the same contract differently; only the inputs (the
+capability vocabulary, the resource components, the parameter
+shape) are shared.
+
+For a request bearing a resolved set of roles
+`R = {r1, r2, ...}`, a required capability `c`, a resource, and
+parameters:
+
+```
+let target_workspace =
+ resource.workspace (workspace-/flow-level resources)
+ or parameters.workspace (system-level resources whose
+ parameters reference a workspace)
+ or unset (system-level operations with no
+ workspace context)
+
+allow if some role r in R has c in its capability bundle
+ and (target_workspace is unset
+ or r's workspace_scope permits target_workspace)
+```
+
+The OSS regime considers workspace from whichever role it plays in
+the operation:
+
+- For workspace-level and flow-level resources, the workspace lives
+ in `resource.workspace` and that is what the role's scope is
+ checked against.
+- For system-level resources whose operation parameters reference a
+ workspace (e.g. `create-user with workspace association W`),
+ workspace lives in `parameters.workspace` and that is what the
+ role's scope is checked against. The resource is system-level
+ (`resource = {}`) but the workspace constraint still bites.
+- For system-level operations with no workspace context (e.g.
+ `bootstrap`, `rotate-signing-key`), the workspace-scope check
+ collapses — only capability-bundle membership matters.
+
+No hierarchy, no precedence, no role-order sensitivity. A user
+with a single role is the common case; a user with multiple roles
+is allowed if any role independently grants both the capability
+and the relevant workspace scope.
+
+### Enforcement boundary
+
+Capability checks — and authentication — are applied **only at the
+API gateway**, on requests arriving from external callers.
+Operations originating inside the platform (backend service to
+backend service, agent to LLM, flow-svc to config-svc, bootstrap
+initialisers, scheduled reconcilers, autonomous flow steps) are
+**not capability-checked**. Backend services trust the workspace
+set by the gateway on inbound pub/sub messages and trust
+internally-originated messages without further authorisation.
+
+This policy has four consequences that are part of the spec, not
+accidents of implementation:
+
+1. **The gateway is the single trust boundary for user
+ authorisation.** Every backend service is a downstream consumer
+ of an already-authorised workspace scope.
+2. **Pub/sub carries workspace, not user identity.** Messages on
+ the bus do not carry credentials or the identity that originated
+ a request; they carry the resolved workspace only. This keeps
+ the bus protocol free of secrets and aligns with the workspace
+ resolver's role as the gateway-side narrowing step.
+3. **Composition is transitive.** Granting a capability that the
+ platform composes internally (for example, `agent`) transitively
+ grants everything that capability composes under the hood,
+ because the downstream calls are internal-origin and are not
+ re-checked. The composite nature of `agent` described above is
+ a consequence of this policy, not a special case.
+4. **Internal-origin operations have no user.** Bootstrap,
+ reconcilers, and other platform-initiated work act with
+ system-level authority. The workspace field on such messages
+ identifies which workspace's data is being touched, not who
+ asked.
+
+**Trust model.** Whoever has pub/sub access is implicitly trusted
+to act as any workspace. Defense-in-depth within the backend is
+not part of this design; the security perimeter is the gateway
+and the bus itself (TLS / network isolation between the bus and
+any untrusted network).
+
+### Unknown capabilities and unknown roles
+
+- An endpoint declaring an unknown capability is a server-side bug
+ and fails closed (403, logged).
+- A role name carried by a user that is not defined in the role
+ table is ignored for authorisation purposes and logged as a
+ warning. Behaviour is deterministic: unknown roles contribute
+ zero capabilities.
+
+### Capability scope
+
+Every capability is **implicitly scoped to the caller's resolved
+workspace**. A `users:write` capability does not permit a user
+in workspace `acme` to create users in workspace `beta` — the
+workspace-resolver has already narrowed the request to one
+workspace before the capability check runs. See the IAM
+specification for the workspace-resolver contract.
+
+The three exceptions are the system-level capabilities
+`workspaces:admin` and `iam:admin`, which operate across
+workspaces by definition, and `metrics:read`, which returns
+process-level series not scoped to any workspace.
+
+## Enterprise extensibility
+
+Enterprise editions extend the role table additively:
+
+```
+data-analyst: {graph:read, documents:read, rows:read, collections:read, knowledge:read}
+helpdesk: {users:read, users:write, users:admin, keys:admin}
+data-engineer: writer + {flows:read, config:read}
+workspace-owner: admin − {workspaces:admin, iam:admin}
+```
+
+None of this requires a protocol change — the wire-protocol `roles`
+field on user records is already a set, the gateway's
+capability-check is already capability-based, and the capability
+vocabulary is closed. Enterprises may introduce roles whose bundles
+compose the same capabilities differently.
+
+When an enterprise introduces a new capability (e.g. for a feature
+that does not exist in open source), the capability string is
+added to the vocabulary and recognised by the gateway build that
+ships that feature.
+
+## References
+
+- [IAM Contract Specification](iam-contract.md) — the abstract
+ gateway↔IAM regime contract; capability strings are inputs to
+ `authorise`.
+- [Identity and Access Management Specification](iam.md)
+- [IAM Service Protocol Specification](iam-protocol.md) — the OSS
+ regime's wire-level protocol.
+- [Architecture Principles](architecture-principles.md)
diff --git a/docs/tech-specs/data-ownership-model.md b/docs/tech-specs/data-ownership-model.md
index ea94ec46..b112d195 100644
--- a/docs/tech-specs/data-ownership-model.md
+++ b/docs/tech-specs/data-ownership-model.md
@@ -22,8 +22,16 @@ are the boundaries around data, and who owns what?
A workspace is the primary isolation boundary. It represents an
organisation, team, or independent operating unit. All data belongs to
-exactly one workspace. Cross-workspace access is never permitted through
-the API.
+exactly one workspace.
+
+Cross-workspace access through the API is gated by the IAM regime
+(see [`iam-contract.md`](iam-contract.md)). In the OSS distribution,
+the role table defined in [`capabilities.md`](capabilities.md)
+permits cross-workspace operation only to the `admin` role; the
+`reader` and `writer` roles are constrained to a single assigned
+workspace per credential. Other regimes can model the relationship
+between identity and workspace differently — the gateway makes no
+assumption.
A workspace owns:
- Source documents
@@ -279,9 +287,18 @@ A typical workflow:
The current codebase uses a `user` field in message metadata and storage
partition keys to identify the workspace. The `collection` field
-identifies the collection within that workspace. The IAM spec describes
-how the gateway maps authenticated credentials to a workspace identity
-and sets these fields.
+identifies the collection within that workspace.
+
+The gateway is the single point at which workspace gets stamped onto
+outbound pub/sub messages. An incoming credential authenticates to a
+workspace (the credential's binding, not a user-to-workspace lookup —
+see [`iam-contract.md`](iam-contract.md) and the *Identity surface*
+section of [`iam.md`](iam.md)); any caller-supplied workspace on the
+request is reconciled against the authenticated identity by the IAM
+regime; the resolved value is what the gateway writes into outgoing
+messages and the storage layers' partition keys. Backend services
+trust the workspace they receive — enforcement happens at the
+gateway, not at the bus.
For details on how each storage backend implements this scoping, see:
@@ -302,7 +319,10 @@ For details on how each storage backend implements this scoping, see:
## References
-- [Identity and Access Management](iam.md)
+- [IAM Contract](iam-contract.md) — gateway↔IAM regime abstraction.
+- [Identity and Access Management](iam.md) — gateway-side framing.
+- [Capability Vocabulary](capabilities.md) — capability strings and
+ the OSS role bundles that decide cross-workspace eligibility.
- [Collection Management](collection-management.md)
- [Entity-Centric Graph](entity-centric-graph.md)
- [Neo4j User Collection Isolation](neo4j-user-collection-isolation.md)
diff --git a/docs/tech-specs/flow-class-definition.md b/docs/tech-specs/flow-blueprint-definition.md
similarity index 100%
rename from docs/tech-specs/flow-class-definition.md
rename to docs/tech-specs/flow-blueprint-definition.md
diff --git a/docs/tech-specs/iam-contract.md b/docs/tech-specs/iam-contract.md
new file mode 100644
index 00000000..da23fb31
--- /dev/null
+++ b/docs/tech-specs/iam-contract.md
@@ -0,0 +1,403 @@
+---
+layout: default
+title: "IAM Contract Technical Specification"
+parent: "Tech Specs"
+---
+
+# IAM Contract Technical Specification
+
+## Overview
+
+The IAM contract is the abstraction between the API gateway and any
+identity / access management regime that fronts it. The gateway
+treats IAM as a black box behind two operations — *authenticate* and
+*authorise* — plus a small surface of management operations. No
+regime-specific concept (roles, scopes, groups, claims, policy
+languages) is visible to the gateway, and no gateway-specific
+concept (capability vocabulary, request anatomy) is visible to
+backend services.
+
+The TrustGraph open-source distribution ships one IAM regime — a
+role-based implementation defined in
+[`iam-protocol.md`](iam-protocol.md) — that is one implementation of
+this contract. Enterprise editions can replace it with a different
+regime (OIDC / SSO, ABAC, ReBAC, external policy engine) without
+changing the gateway, the wire protocol, or the backends.
+
+## Motivation
+
+Authorisation models vary by deployment. A small team might be
+happy with three predefined roles; an enterprise might need group-
+mapping from an upstream IdP, attribute-based policies, or
+relationship-based access control. Hard-wiring any one of those
+into the gateway forces every other regime to either compromise its
+model or be re-implemented.
+
+A narrow contract — "authenticate this credential" and "may this
+identity perform this operation on this resource" — captures what
+the gateway actually needs to know without committing to a policy
+shape. The IAM regime owns the policy decision; the gateway is a
+generic enforcement point.
+
+## Operations
+
+### `authenticate`
+
+```
+authenticate(credential: bytes) → Identity | AuthFailure
+```
+
+Validates a credential the client presented. The gateway treats
+the credential as opaque bytes — for the OSS regime today that's
+either an API key plaintext or a JWT, but the gateway does not
+parse them; the IAM regime decides.
+
+On success, returns an `Identity`. On any failure the IAM regime
+returns the same opaque `AuthFailure` — never a description of which
+condition failed. This is the spec's masked-error rule: an
+attacker probing the endpoint cannot distinguish "no such key",
+"expired", "wrong signature", "revoked", "user disabled", etc.
+
+### `authorise`
+
+```
+authorise(identity: Identity,
+ capability: str,
+ resource: Resource,
+ parameters: dict)
+ → Decision
+```
+
+Asks whether the identity is permitted to perform the named
+capability on the named resource, given the operation's
+parameters. Returns `allow` or `deny`. `identity` is whatever
+`authenticate` returned for this caller; the gateway never
+decomposes it.
+
+The four arguments separate concerns:
+
+- **`identity`** — who is asking.
+- **`capability`** — what permission they are exercising (e.g.
+ `users:write`, `graph:read`). Permission, not structure.
+- **`resource`** — what is being operated on, as a structured
+ identifier. See *The Resource model* below.
+- **`parameters`** — operation-specific data that the regime may
+ need to consider beyond the resource identifier. Used when a
+ decision depends on attributes the request supplies — e.g.
+ creating a user *with workspace association W*: the resource is
+ the system-level user registry, and W is a parameter the regime
+ checks against the caller's permissions for `users:write`.
+
+Different regimes use the four arguments differently — one regime
+might evaluate role bundles whose grants carry workspace scope;
+another might consult upstream IdP group memberships; an ABAC
+regime evaluates a policy with all four as inputs. The contract
+is unchanged.
+
+### `authorise_many`
+
+```
+authorise_many(identity: Identity,
+ checks: list[(str, Resource, dict)])
+ → list[Decision]
+```
+
+Bulk variant of `authorise`. Same semantics, one round-trip for
+many decisions. Used when an operation fans out to multiple
+resources (e.g. an agent that touches several workspaces) and a
+single permission check isn't sufficient.
+
+`authorise_many` is not just a performance optimisation; it pins
+the contract for fan-out operations early, before clients (or
+internal callers) build patterns that assume one-permission-check-
+per-request. Regimes implement it as a loop over `authorise`
+unless they have a more efficient path.
+
+### Management operations
+
+Beyond the request-time `authenticate` / `authorise`, the contract
+also covers identity-lifecycle and credential-lifecycle operations
+that are invoked by administrative requests rather than by the
+authentication path. These are regime-specific in detail (an SSO
+regime that delegates user management to the IdP may not implement
+most of them) but the operation set the gateway can forward is:
+
+- User management: `create-user`, `list-users`, `get-user`,
+ `update-user`, `disable-user`, `enable-user`, `delete-user`
+- Credential management: `create-api-key`, `list-api-keys`,
+ `revoke-api-key`, `change-password`, `reset-password`
+- Workspace management: `create-workspace`, `list-workspaces`,
+ `get-workspace`, `update-workspace`, `disable-workspace`
+- Session management: `login`, `whoami`
+- Key management: `get-signing-key-public`, `rotate-signing-key`
+- Bootstrap: `bootstrap`, `bootstrap-status`
+
+`whoami` is the self-read counterpart to `get-user`: any
+authenticated caller can read their own identity record without
+holding a user-management capability. It is the gating-free probe
+a UI uses to render affordances appropriate to the caller's role.
+
+`bootstrap-status` is a side-effect-free probe of whether an
+unconsumed `bootstrap` call would currently succeed. It exists so
+a first-run UI can decide whether to render setup without invoking
+the consuming `bootstrap` op. Public — no authentication.
+
+A regime that does not support one of these (e.g. an SSO regime
+where users are managed in the IdP) returns a defined "not
+supported" error; the gateway surfaces it as a 501.
+
+### Actor injection
+
+For any management operation forwarded by the gateway after
+authentication, the gateway injects the authenticated caller's
+`handle` as an `actor` field on the request. Regimes use `actor`
+to identify *who is making the request* — distinct from the
+operation's target (which lives in `user_id` / `key_id` /
+`workspace_record` / etc.) — for purposes such as:
+
+- Self-service operations (`whoami`, `change-password`) that
+ resolve "the caller" without taking a target argument.
+- Audit logging, where the actor is recorded against the change.
+- Decisions that depend on the resolved resource state. The
+ gateway authorises against the parameters on the request, but it
+ cannot know the resolved resource's actual properties (e.g. the
+ workspace association of a target user) before the regime loads
+ it. When that matters, the regime can re-decide using the
+ actor's permissions and the resolved record — closing a class
+ of cases the gateway-side check can't see.
+
+Caller-supplied `actor` values on the request body are overwritten
+by the gateway — the gateway is the only authority for actor
+identity, and a regime that consults `actor` can rely on it being
+authentic.
+
+## The `Identity` surface
+
+`Identity` is *mostly* opaque. The gateway holds the value as a
+token to quote back when calling `authorise`, never decomposing it.
+But there are a few gateway-side concerns that need a small
+surface:
+
+| Field | Purpose |
+|---|---|
+| `handle` | Opaque reference passed back to `authorise`. Regime-defined; gateway treats as a string. |
+| `workspace` | The workspace this credential authenticates to. Used by the gateway only as a default-fill-in for operations that omit a workspace. Never used as policy input — when authorisation needs to know which workspace the operation acts on, the operation places it in the resource address (or a parameter), and the regime decides. |
+| `principal_id` | Stable identifier the gateway logs for audit (a user id, a sub claim, a service account id). Never used for authorisation — that's `authorise`'s job. |
+| `source` | How the credential was presented (`api-key`, `jwt`, …). Non-policy; useful for logs and metrics only. |
+
+Anything else — roles, claims, group memberships, policy attributes
+— stays inside the regime and is reachable only via `authorise`.
+
+## The `Resource` model
+
+A `Resource` is a structured value identifying *what is being
+operated on*. Resources live at one of three levels in TrustGraph,
+based on where the resource exists in the deployment:
+
+### Resource levels
+
+| Level | What lives there | Resource shape |
+|---|---|---|
+| **System** | The user registry, the workspace registry, the signing key, the audit log — anything that exists once per deployment. | `{}` |
+| **Workspace** | A workspace's config, flow definitions, library (documents), knowledge cores, collections — things that exist *within* a workspace. | `{workspace: "..."}` |
+| **Flow** | A flow's knowledge graph, agent state, LLM context, embedding state, MCP context — things that exist *within* a flow within a workspace. | `{workspace: "...", flow: "..."}` |
+
+Note carefully:
+
+- **Users are a system-level resource.** A user record exists at
+ the deployment level; the fact that a user has a *workspace
+ association* (one in OSS, possibly many in other regimes) is a
+ property of the user record, not a containment. Operations on
+ the user registry have `resource = {}`; the workspace
+ association appears as a *parameter*, not as a resource address
+ component.
+- **Workspaces themselves are a system-level resource.** The
+ workspace registry exists at the deployment level. `create-
+ workspace` and `list-workspaces` are system-level operations;
+ the workspace identifier in their bodies is a parameter, not an
+ address.
+- **A workspace's contents are workspace-level resources.** A
+ workspace's config, flows, library, etc. live within a
+ workspace. Their resource address is `{workspace: ...}`.
+- **A flow's contents are flow-level resources.** Knowledge
+ graphs, agents, etc. live within a flow. Their resource
+ address is `{workspace: ..., flow: ...}`.
+
+### Component vocabulary
+
+| Component | Type | Meaning | Used by |
+|---|---|---|---|
+| `workspace` | string | Identifier of the workspace whose contents are being operated on | workspace-level and flow-level resource addresses |
+| `flow` | string | Identifier of a flow within a workspace; always paired with `workspace` | flow-level resource addresses |
+| `collection` | string | Reserved for finer-grained scoping within a workspace | future / enterprise |
+| `document` | string | Reserved for per-document scoping | future / enterprise |
+
+A `Resource` is a partial mapping of these components to values.
+The level of the resource (system / workspace / flow) determines
+which components must be present. An empty `{}` is the
+system-level resource.
+
+### Workspace as parameter vs. address
+
+Workspace plays two distinct roles in operations and shows up in
+two distinct places:
+
+- **As a resource address component** — workspace identifies the
+ thing being operated on. Lives in `resource.workspace`. Example:
+ `config:read` reads the config *of* workspace W.
+- **As an operation parameter** — workspace is data the operation
+ acts on or filters by, while the resource itself is system-level.
+ Lives in `parameters.workspace`. Example: `users:write`
+ creates a user *with workspace association* W; the resource is
+ the user registry (system), and W is a parameter.
+
+These are not interchangeable. The IAM regime considers each role
+separately; the OSS role table, for instance, applies workspace-
+scope to the address component when checking workspace-level
+operations, and to a parameter when checking
+"create-user-with-workspace-W". Both end up enforcing the admin's
+scope, but through different code paths.
+
+### Extension rules
+
+The vocabulary is closed but extensible. Adding a new component:
+
+1. The component is added to the vocabulary in this spec, with a
+ defined name, type, and meaning.
+2. Existing IAM regimes ignore unknown components (forward
+ compatibility — adding a new component does not break older
+ regimes that don't understand it).
+3. Older gateways that don't populate a new component leave it
+ unset; regimes that need it for a decision treat "unset" as
+ "absent" and decide accordingly (typically: cannot grant
+ permission scoped to a component the gateway didn't supply).
+
+A regime that wants stricter behaviour (e.g. fail-closed on
+unknown components rather than ignoring them) declares so as part
+of its own configuration; the contract default is "ignore unknown".
+
+## Operation registry (gateway-side)
+
+Mapping a request onto `(capability, resource, parameters)` is
+service-specific — it cannot be inferred from the capability
+alone. The gateway maintains an **operation registry** that
+declares, per operation:
+
+- The required capability.
+- The resource level (system / workspace / flow) — determines the
+ shape of the resource identifier.
+- How to extract the resource address components (workspace,
+ flow) from the request — from URL path, WebSocket envelope, or
+ body.
+- Which body fields are operation parameters (and which of those
+ the IAM regime should see in the `parameters` argument).
+
+This registry is part of the gateway's endpoint declarations, not
+part of the IAM contract. The contract specifies what arguments
+`authorise` receives; how the gateway populates them is its own
+concern.
+
+In the OSS gateway, registry keys follow these conventions:
+
+| Pattern | Used by | Resource level |
+|---|---|---|
+| bare op name (`create-user`, `list-users`, `login`, …) | `/api/v1/iam` and the auth surface | system / workspace, per op |
+| `<kind>:<op>` (`config:get`, `flow:list-blueprints`, `librarian:add-document`, …) | `/api/v1/{kind}` (workspace-scoped global services) | workspace |
+| `flow-service:<kind>` (`flow-service:agent`, `flow-service:graph-rag`, …) | `/api/v1/flow/{flow}/service/{kind}` and the WS Mux | flow |
+| `flow-import:<kind>` / `flow-export:<kind>` | `/api/v1/flow/{flow}/{import,export}/{kind}` streaming sockets | flow |
+
+Keys are an OSS-gateway implementation detail — the contract does
+not constrain naming. The conventions above exist so the registry
+key is uniquely derivable from the request path and (where
+applicable) body without ambiguity.
+
+## Caching
+
+Both `authenticate` and `authorise` results are cached at the
+gateway, on different policies:
+
+- **`authenticate`** — cached by a hash of the credential. The OSS
+ gateway uses a fixed short TTL (currently 60 s) so that revoked
+ API keys and disabled users stop working within the TTL window
+ without any push mechanism. Regimes that want a different
+ behaviour can return an `expires` hint with the identity; the
+ gateway honours the smaller of `expires` and its own ceiling.
+
+- **`authorise`** — cached by a hash of `(handle, capability,
+ resource, parameters)`. The regime returns a suggested TTL with
+ the decision; the gateway caps it at a deployment-set
+ ceiling (currently 60 s). Both allow and deny decisions are
+ cached; denies briefly, to avoid hammering the regime with
+ repeated rejected attempts.
+
+The TTL ceiling caps the revocation latency window — a role
+revoked at the regime takes effect at the gateway no later than
+the ceiling. Operators that need stricter revocation can lower
+the ceiling.
+
+## Failure modes
+
+| Condition | Behaviour |
+|---|---|
+| `authenticate` returns AuthFailure | Gateway responds 401 with the masked `auth failure` body. |
+| `authorise` returns deny | Gateway responds 403 with the masked `access denied` body. |
+| IAM regime unreachable | Gateway responds 401 / 503 (deployment-defined). No fail-open. |
+| `authorise_many` partial deny | Gateway treats the request as denied; the operation is rejected. Partial-success semantics are not part of the contract. |
+| Regime returns "not supported" for a management operation | Gateway responds 501. |
+
+There is no fallback or "soft" decision path. An IAM regime that
+is unavailable, slow, or returning errors causes requests to fail
+closed.
+
+## Implementations
+
+### Open-source role-based regime
+
+Defined in [`iam-protocol.md`](iam-protocol.md). Implements the
+contract via:
+
+- A pub/sub request/response service (`iam-svc`) reached only by
+ the gateway over the message bus.
+- Credentials are API keys (opaque) or JWTs (Ed25519, locally
+ validated by the gateway against the regime's published public
+ key).
+- `authorise` reduces to a lookup against the role bundles in
+ [`capabilities.md`](capabilities.md), with each grant's workspace
+ scope checked against the operation's workspace component.
+- Identity, user, and workspace records live in Cassandra.
+
+The OSS regime is deliberately simple — three roles, a single
+workspace association per user (a regime data-model decision, not
+a contract assertion), no policy language. Other regimes can
+grant the same user different permissions in different workspaces
+without changing anything outside the regime.
+
+### Future regimes
+
+The contract is shaped to admit, without code change in the
+gateway:
+
+- **OIDC / SSO** — `authenticate` validates an OIDC ID token via
+ the IdP's JWKS; `Identity.handle` carries the verified subject
+ and group claims; `authorise` evaluates against group-to-
+ capability mappings configured at the regime.
+- **ABAC / Policy engine** — `authorise` calls out to a policy
+ engine (Rego, Cedar, custom DSL) with the identity's attributes
+ and the resource as the policy input.
+- **ReBAC (Zanzibar-style)** — `authorise` translates `(identity,
+ capability, resource)` into a relationship-tuple lookup against
+ a tuple store.
+- **Hybrid** — multiple regimes composed: e.g. authenticate via
+ SSO, authorise via local policy.
+
+None of these require gateway changes. The contract surface is
+the same; the regime is what differs.
+
+## References
+
+- [Identity and Access Management Specification](iam.md) — overall
+ design and the gateway-side framing.
+- [IAM Service Protocol Specification](iam-protocol.md) — the OSS
+ regime's wire-level protocol.
+- [Capability Vocabulary Specification](capabilities.md) — the
+ capability strings the gateway uses as `authorise` input.
diff --git a/docs/tech-specs/iam-protocol.md b/docs/tech-specs/iam-protocol.md
new file mode 100644
index 00000000..e7e7984e
--- /dev/null
+++ b/docs/tech-specs/iam-protocol.md
@@ -0,0 +1,386 @@
+---
+layout: default
+title: "IAM Service Protocol Technical Specification"
+parent: "Tech Specs"
+---
+
+# IAM Service Protocol Technical Specification
+
+## Overview
+
+This document specifies the wire protocol of the **open-source IAM
+regime** — one implementation of the abstract IAM contract defined
+in [`iam-contract.md`](iam-contract.md). Other regimes (OIDC / SSO,
+ABAC, ReBAC, external policy engines) implement the same contract
+with different transports, data models, and policy semantics; the
+gateway is unaware of which regime it's wired against.
+
+The OSS regime is a backend processor (`iam-svc`) reached over the
+standard request/response pub/sub pattern. It owns users,
+workspaces, API keys, login credentials, and JWT signing keys, all
+backed by Cassandra. The API gateway is its only caller.
+
+This document defines:
+
+- the `IamRequest` and `IamResponse` dataclasses on the bus,
+- the operation set the OSS regime implements,
+- per-operation input and output fields,
+- the error taxonomy,
+- the bootstrap modes,
+- the initial HTTP forwarding endpoint used while the protocol is
+ being exercised.
+
+The mapping from this regime onto the abstract contract is direct:
+
+| Contract operation | OSS regime operation |
+|---|---|
+| `authenticate(credential)` | `resolve-api-key` (for API keys); local JWT validation against `get-signing-key-public` (for JWTs) |
+| `authorise(identity, capability, resource, parameters)` | Role-table lookup against the OSS role bundles defined in [`capabilities.md`](capabilities.md), gated by workspace scope. Workspace can come from the resource address (workspace- and flow-level resources) or from a parameter (system-level resources whose parameters reference a workspace, e.g. `create-user with workspace association W`). |
+| `authorise_many` | Loop over `authorise` |
+| Identity / credential / workspace management | `create-user`, `create-api-key`, etc. as listed below. These are operations on system-level resources (the user / workspace / credential registries); workspace, where it appears in the body, is a parameter. |
+
+Architectural context — roles, capabilities, workspace as resource
+scope, enforcement boundary — lives in [`iam.md`](iam.md) and
+[`capabilities.md`](capabilities.md). The contract abstraction
+lives in [`iam-contract.md`](iam-contract.md).
+
+## Transport
+
+- **Request topic:** `request:tg/request/iam-request`
+- **Response topic:** `response:tg/response/iam-response`
+- **Pattern:** request/response, correlated by the `id` message
+ property, the same pattern used by `config-svc` and `flow-svc`.
+- **Caller:** the API gateway only. Under the enforcement-boundary
+ policy (see capabilities spec), the IAM service trusts the bus
+ and performs no per-request authentication or capability check
+ against the caller. The gateway has already evaluated capability
+ membership and workspace scoping before sending the request.
+
+## Dataclasses
+
+### `IamRequest`
+
+```python
+@dataclass
+class IamRequest:
+ # One of the operation strings below.
+ operation: str = ""
+
+ # Scope of this request. Required on every workspace-scoped
+ # operation. Omitted (or empty) for system-level ops
+ # (workspace CRUD, signing-key ops, bootstrap, resolve-api-key,
+ # login).
+ workspace: str = ""
+
+ # Acting user id. Set by the gateway to the authenticated
+ # caller's identity handle for every authenticated request
+ # (overwrites any caller-supplied value — the gateway is the
+ # only authority for actor identity, so handlers can rely on it
+ # being authentic). Used for audit logging, self-service ops
+ # like ``whoami`` that resolve "the caller", and future
+ # actor-scoped policy checks. Empty for unauthenticated ops
+ # (``login``, ``bootstrap``, ``bootstrap-status``,
+ # ``get-signing-key-public``, ``resolve-api-key``). See the
+ # actor-injection rule in the IAM contract spec.
+ actor: str = ""
+
+ # --- identity selectors ---
+ user_id: str = ""
+ username: str = "" # login; unique within a workspace
+ key_id: str = "" # revoke-api-key, list-api-keys (own)
+ api_key: str = "" # resolve-api-key (plaintext)
+
+ # --- credentials ---
+ password: str = "" # login, change-password (current)
+ new_password: str = "" # change-password
+
+ # --- user fields ---
+ user: UserInput | None = None # create-user, update-user
+
+ # --- workspace fields ---
+ workspace_record: WorkspaceInput | None = None # create-workspace, update-workspace
+
+ # --- api key fields ---
+ key: ApiKeyInput | None = None # create-api-key
+```
+
+### `IamResponse`
+
+```python
+@dataclass
+class IamResponse:
+ # Populated on success of operations that return them.
+ user: UserRecord | None = None # create-user, get-user, update-user
+ users: list[UserRecord] = field(default_factory=list) # list-users
+ workspace: WorkspaceRecord | None = None # create-workspace, get-workspace, update-workspace
+ workspaces: list[WorkspaceRecord] = field(default_factory=list) # list-workspaces
+
+ # create-api-key returns the plaintext once. Never populated
+ # on any other operation.
+ api_key_plaintext: str = ""
+ api_key: ApiKeyRecord | None = None # create-api-key
+ api_keys: list[ApiKeyRecord] = field(default_factory=list) # list-api-keys
+
+ # login, rotate-signing-key
+ jwt: str = ""
+ jwt_expires: str = "" # ISO-8601 UTC
+
+ # get-signing-key-public
+ signing_key_public: str = "" # PEM
+
+ # resolve-api-key returns who this key authenticates as.
+ resolved_user_id: str = ""
+ resolved_workspace: str = ""
+ resolved_roles: list[str] = field(default_factory=list)
+
+ # reset-password
+ temporary_password: str = "" # returned once to the operator
+
+ # bootstrap: on first run, the initial admin's one-time API key
+ # is returned for the operator to capture.
+ bootstrap_admin_user_id: str = ""
+ bootstrap_admin_api_key: str = ""
+
+ # bootstrap-status: true iff an unconsumed ``bootstrap`` call
+ # would currently succeed. Always emitted by the response
+ # translator (the false case is meaningful for first-run UIs).
+ bootstrap_available: bool = False
+
+ # Present on any failed operation.
+ error: Error | None = None
+```
+
+### Value types
+
+```python
+@dataclass
+class UserInput:
+ username: str = ""
+ name: str = ""
+ email: str = ""
+ password: str = "" # only on create-user; never on update-user
+ roles: list[str] = field(default_factory=list)
+ enabled: bool = True
+ must_change_password: bool = False
+
+@dataclass
+class UserRecord:
+ id: str = ""
+ workspace: str = ""
+ username: str = ""
+ name: str = ""
+ email: str = ""
+ roles: list[str] = field(default_factory=list)
+ enabled: bool = True
+ must_change_password: bool = False
+ created: str = "" # ISO-8601 UTC
+ # Password hash is never included in any response.
+
+@dataclass
+class WorkspaceInput:
+ id: str = ""
+ name: str = ""
+ enabled: bool = True
+
+@dataclass
+class WorkspaceRecord:
+ id: str = ""
+ name: str = ""
+ enabled: bool = True
+ created: str = "" # ISO-8601 UTC
+
+@dataclass
+class ApiKeyInput:
+ user_id: str = ""
+ name: str = "" # operator-facing label, e.g. "laptop"
+ expires: str = "" # optional ISO-8601 UTC; empty = no expiry
+
+@dataclass
+class ApiKeyRecord:
+ id: str = ""
+ user_id: str = ""
+ name: str = ""
+ prefix: str = "" # first 4 chars of plaintext, for identification in lists
+ expires: str = "" # empty = no expiry
+ created: str = ""
+ last_used: str = "" # empty if never used
+ # key_hash is never included in any response.
+```
+
+## Operations
+
+| Operation | Request fields | Response fields | Notes |
+|---|---|---|---|
+| `login` | `username`, `password`, `workspace` (optional) | `jwt`, `jwt_expires` | If `workspace` omitted, IAM resolves to the user's assigned workspace. |
+| `whoami` | `actor` (gateway-injected) | `user` | Returns the calling user's own record. AUTHENTICATED-only; no `users:read` capability required. |
+| `resolve-api-key` | `api_key` (plaintext) | `resolved_user_id`, `resolved_workspace`, `resolved_roles` | Gateway-internal. Service returns `auth-failed` for unknown / expired / revoked keys. |
+| `change-password` | `user_id`, `password` (current), `new_password` | — | Self-service. IAM validates `password` against stored hash. |
+| `reset-password` | `user_id`, `workspace` (optional integrity check) | `temporary_password` | Admin-initiated. IAM generates a random password, sets `must_change_password=true` on the user, returns the plaintext once. |
+| `create-user` | `workspace`, `user` | `user` | `user.password` is hashed and stored; `user.roles` must be subset of known roles. `workspace` is the new user's home-workspace binding (a required *parameter*, not an address). |
+| `list-users` | `workspace` (optional filter) | `users` | If `workspace` omitted, returns the deployment-wide list. |
+| `get-user` | `user_id`, `workspace` (optional integrity check) | `user` | |
+| `update-user` | `user_id`, `user`, `workspace` (optional integrity check) | `user` | `password` field on `user` is rejected; use `change-password` / `reset-password`. Username is immutable. |
+| `disable-user` | `user_id`, `workspace` (optional integrity check) | — | Soft-delete; sets `enabled=false`. Revokes all the user's API keys. |
+| `enable-user` | `user_id`, `workspace` (optional integrity check) | — | Re-enables a previously disabled user; does not restore API keys. |
+| `delete-user` | `user_id`, `workspace` (optional integrity check) | — | Hard-delete; removes user record, username lookup, and all the user's API keys. |
+| `create-workspace` | `workspace_record` | `workspace` | System-level. |
+| `list-workspaces` | — | `workspaces` | System-level. |
+| `get-workspace` | `workspace_record` (id only) | `workspace` | System-level. |
+| `update-workspace` | `workspace_record` | `workspace` | System-level. |
+| `disable-workspace` | `workspace_record` (id only) | — | System-level. Sets `enabled=false`; revokes all workspace API keys; disables all users in the workspace. |
+| `create-api-key` | `key`, `workspace` (optional integrity check) | `api_key_plaintext`, `api_key` | Plaintext returned **once**; only hash stored. `key.name` required. |
+| `list-api-keys` | `user_id`, `workspace` (optional integrity check) | `api_keys` | |
+| `revoke-api-key` | `key_id`, `workspace` (optional integrity check) | — | Deletes the key record. |
+| `get-signing-key-public` | — | `signing_key_public` | Gateway fetches this at startup. |
+| `rotate-signing-key` | — | — | System-level. Introduces a new signing key; old key continues to validate JWTs for a grace period (implementation-defined, minimum 1h). |
+| `bootstrap` | — | `bootstrap_admin_user_id`, `bootstrap_admin_api_key` | If IAM tables are empty and the service is in `bootstrap` mode, creates the initial `default` workspace, an `admin` user, an initial API key, and an initial signing key; returns them once. Otherwise returns a masked auth failure. |
+| `bootstrap-status` | — | `bootstrap_available` | Side-effect-free probe; `true` iff iam-svc is in `bootstrap` mode and tables are empty. Intended for first-run UX. |
+
+## Error taxonomy
+
+All errors are carried in the `IamResponse.error` field. `error.type`
+is one of the values below; `error.message` is a human-readable
+string that is **not** surfaced verbatim to external callers (the
+gateway maps to `auth failure` / `access denied` per the IAM error
+policy).
+
+| `type` | When |
+|---|---|
+| `invalid-argument` | Malformed request (missing required field, unknown operation, invalid format). |
+| `not-found` | Named resource does not exist (`user_id`, `key_id`, workspace). |
+| `duplicate` | Create operation collides with an existing resource (username, workspace id, key name). |
+| `auth-failed` | `login` with wrong credentials; `resolve-api-key` with unknown / expired / revoked key; `change-password` with wrong current password. Single bucket to deny oracle attacks. |
+| `weak-password` | Password does not meet policy (length, complexity — policy defined at service level). |
+| `disabled` | Target user or workspace has `enabled=false`. |
+| `operation-not-permitted` | Non-admin attempting system-level operation, or workspace-scoped operation attempting to affect another workspace. |
+| `internal-error` | Unexpected IAM-side failure. Log and surface as 500 at the gateway. |
+
+The gateway is responsible for translating `auth-failed` and
+`operation-not-permitted` into the obfuscated external error
+response (`"auth failure"` / `"access denied"`); `invalid-argument`
+becomes a descriptive 400; `not-found` / `duplicate` /
+`weak-password` / `disabled` become descriptive 4xx but never leak
+IAM-internal detail.
+
+## Credential storage
+
+- **Passwords** are stored using a slow KDF (bcrypt / argon2id — the
+ service picks; documented as an implementation detail). The
+ `password_hash` column stores the full KDF-encoded string
+ (algorithm, cost, salt, hash). Not a plain SHA-256.
+- **API keys** are stored as SHA-256 of the plaintext. API keys
+ are 128-bit random values (`tg_` + base64url); the entropy
+ makes a slow hash unnecessary. The hash serves as the primary
+ key on the `iam_api_keys` table, enabling O(1) lookup on
+ `resolve-api-key`.
+- **JWT signing key** is stored as an RSA or Ed25519 private key
+ (implementation choice) in a dedicated `iam_signing_keys` table
+ with a `kid`, `created`, and optional `retired` timestamp. At
+ most one active key; up to N retired keys are kept for a grace
+ period to validate previously-issued JWTs.
+
+Passwords, API-key plaintext, and signing-key private material are
+never returned in any response other than the explicit one-time
+responses above (`reset-password`, `create-api-key`, `bootstrap`).
+
+## Bootstrap modes
+
+`iam-svc` requires a bootstrap mode to be chosen at startup. There is
+no default — an unset or invalid mode causes the service to refuse
+to start. The purpose is to force the operator to make an explicit
+security decision rather than rely on an implicit "safe" fallback.
+
+| Mode | Startup behaviour | `bootstrap` operation | Suitability |
+|---|---|---|---|
+| `token` | On first start with empty tables, auto-seeds the `default` workspace, admin user, admin API key (using the operator-provided `--bootstrap-token`), and an initial signing key. No-op on subsequent starts. | Refused — returns `auth-failed` / `"auth failure"` regardless of caller. | Production, any public-exposure deployment. |
+| `bootstrap` | No startup seeding. Tables remain empty until the `bootstrap` operation is invoked over the pub/sub bus (typically via `tg-bootstrap-iam`). | Live while tables are empty. Generates and returns the admin API key once. Refused (`auth-failed`) once tables are populated. | Dev / compose up / CI. **Not safe under public exposure** — any caller who reaches the gateway's `/api/v1/iam` forwarder before the operator does can have the bootstrap admin API key issued to them instead. Operators choosing this mode accept that risk. |
+
+### Error masking
+
+In both modes, any refused invocation of the `bootstrap` operation
+returns the same error (`auth-failed` / `"auth failure"`). A caller
+cannot distinguish:
+
+- "service is in token mode"
+- "service is in bootstrap mode but already bootstrapped"
+- "operation forbidden"
+
+This matches the general IAM error-policy stance (see `iam.md`) and
+prevents externally enumerating IAM's state.
+
+### Configuration sources
+
+The mode and token can be supplied in two ways. Resolution order is
+fixed; there is no permissive fallback.
+
+| Source | Field |
+|---|---|
+| Processor-group YAML / CLI argument | `bootstrap_mode`, `bootstrap_token` |
+| Environment variable | `IAM_BOOTSTRAP_MODE`, `IAM_BOOTSTRAP_TOKEN` |
+
+For each setting the service uses the explicit param value if
+present; otherwise the environment variable; otherwise the service
+refuses to start. The env-var path is intended for the K8s
+deployment pattern where the token is injected from a `Secret` via
+`secretKeyRef`, so the plaintext never has to live in YAML or git.
+A typical production manifest holds `bootstrap_mode: "token"` in
+the YAML and pulls `IAM_BOOTSTRAP_TOKEN` from the Secret; the YAML
+is then safe to version-control.
+
+### Bootstrap-token lifecycle
+
+The bootstrap token — whether operator-supplied (`token` mode) or
+service-generated (`bootstrap` mode) — is a one-time credential. It
+is stored as the admin user's only API key, tagged `name="bootstrap"`. The
+operator's first admin action after bootstrap should be:
+
+1. Create a durable admin user and API key (or issue a durable API
+ key to the bootstrap admin).
+2. Revoke the bootstrap key via `revoke-api-key`.
+3. Remove the bootstrap token from any deployment configuration
+ (Secret, env var, or YAML field — wherever it was sourced).
+
+The `name="bootstrap"` marker makes bootstrap keys easy to detect in
+tooling (e.g. a `tg-list-api-keys` filter).
+
+## HTTP forwarding (initial integration)
+
+For the initial gateway integration — before the IAM service is
+wired into the authentication middleware — the gateway exposes a
+single forwarding endpoint:
+
+```
+POST /api/v1/iam
+```
+
+- Request body is a JSON encoding of `IamRequest`.
+- Response body is a JSON encoding of `IamResponse`.
+- The gateway's existing authentication (`GATEWAY_SECRET` bearer)
+ gates access to this endpoint so the IAM protocol can be
+ exercised end-to-end in tests without touching the live auth
+ path.
+- This endpoint is **not** the final shape. Once the middleware is
+ in place, per-operation REST endpoints replace it (for example
+ `POST /api/v1/auth/login`, `POST /api/v1/users`, `DELETE
+ /api/v1/api-keys/{id}`), and this generic forwarder is removed.
+
+The endpoint performs only message marshalling: it does not read
+or rewrite fields in the request, and it applies no capability
+check. All authorisation for user / workspace / key management
+lands in the subsequent middleware work.
+
+## Non-goals for this spec
+
+- REST endpoint shape for the final gateway surface — covered in
+ Phase 2 of the IAM implementation plan, not here.
+- OIDC / SAML external IdP protocol — out of scope for open source.
+- Key-signing algorithm choice, password KDF choice, JWT claim
+ layout — implementation details captured in code + ADRs, not
+ locked in the protocol spec.
+
+## References
+
+- [IAM Contract Specification](iam-contract.md) — the abstract
+ gateway↔IAM regime contract this protocol implements.
+- [Identity and Access Management Specification](iam.md)
+- [Capability Vocabulary Specification](capabilities.md)
diff --git a/docs/tech-specs/iam.md b/docs/tech-specs/iam.md
index 5de50749..dd0e12f5 100644
--- a/docs/tech-specs/iam.md
+++ b/docs/tech-specs/iam.md
@@ -199,9 +199,9 @@ The server rejects all non-auth messages until authentication succeeds.
The socket remains open on auth failure, allowing the client to retry
with a different token without reconnecting. The client can also send
a new auth message at any time to re-authenticate — for example, to
-refresh an expiring JWT or to switch workspace. The
-resolved identity (user, workspace, roles) is updated on each
-successful auth.
+refresh an expiring JWT or to switch workspace. The resolved
+identity (handle, workspace, principal_id, source) is updated on
+each successful auth.
#### API keys
@@ -219,7 +219,7 @@ For programmatic access: CLI tools, scripts, and integrations.
On each request, the gateway resolves an API key by:
1. Hashing the token.
-2. Checking a local cache (hash → user/workspace/roles).
+2. Checking a local cache (hash → identity).
3. On cache miss, calling the IAM service to resolve.
4. Caching the result with a short TTL (e.g. 60 seconds).
@@ -233,9 +233,15 @@ For interactive access via the UI or WebSocket connections.
- A user logs in with username and password. The gateway forwards the
request to the IAM service, which validates the credentials and
returns a signed JWT.
-- The JWT carries the user ID, workspace, and roles as claims.
+- The JWT carries identity-binding claims only — user id (`sub`)
+ and the workspace this credential authenticates to. No roles,
+ no policy state. Per the IAM contract, all policy decisions go
+ through `authorise`; the gateway never reads roles or other
+ regime-internal state from the credential.
- The gateway validates JWTs locally using the IAM service's public
- signing key — no service call needed on subsequent requests.
+ signing key — no service call needed for the authentication step;
+ authorisation calls remain per-request (cached per the contract's
+ caching rules).
- Token expiry is enforced by standard JWT validation at the time the
request (or WebSocket connection) is made.
- For long-lived WebSocket connections, the JWT is validated at connect
@@ -262,6 +268,26 @@ The gateway forwards this to the IAM service, which validates
credentials and returns a signed JWT. The gateway returns the JWT to
the caller.
+#### Self-service: `whoami` and `bootstrap-status`
+
+Two side-effect-free probes that exist to support UI affordances
+without giving the caller broad read access:
+
+- `POST /api/v1/iam` with `{"operation": "whoami"}` — authenticated
+ only. Returns the caller's own user record (id, username, name,
+ email, workspace, roles, enabled, must_change_password,
+ created). No `users:read` capability is required, because every
+ authenticated caller can read themselves. The gateway populates
+ `actor` on the request from the authenticated identity, so the
+ regime resolves "the caller" without taking a target argument.
+
+- `POST /api/v1/auth/bootstrap-status` — public, side-effect-free.
+ Returns `{"bootstrap_available": true|false}`. `true` iff
+ iam-svc is in `bootstrap` mode and its tables are empty (i.e. an
+ unconsumed `bootstrap` call would currently succeed). Exists so
+ a first-run UI can decide whether to render the setup flow
+ without invoking the consuming `bootstrap` op.
+
#### IAM service delegation
The gateway stays thin. Its authentication logic is:
@@ -285,35 +311,82 @@ authentication uses API keys or JWTs. On first start, the bootstrap
process creates a default workspace and admin user with an initial API
key.
-### User identity
+### Identity, credentials, and workspace binding
-A user belongs to exactly one workspace. The design supports extending
-this to multi-workspace access in the future (see
-[Extension points](#extension-points)).
+The gateway never asks "which workspace does *this user* belong to?".
+That question forces every IAM regime to expose a user-to-workspace
+mapping, which prevents regimes where the relationship is many-to-many
+or doesn't exist (e.g. SSO with IdP-driven workspace selection).
+Instead, the gateway asks "which workspace does *this credential*
+authenticate to?" — a question every regime can answer in its own
+terms.
-A user record contains:
+A credential (API key, JWT, OIDC token, etc.) is **bound to a
+workspace at issue time**. The IAM regime decides what binding
+means:
+
+- **OSS regime** — each user has a home workspace; credentials
+ issued to that user are bound to that workspace. A 1:1
+ user-to-workspace constraint is an internal data-model decision,
+ not a contract assertion.
+- **Multi-workspace regime** (future / enterprise) — a user with
+ access to several workspaces gets a different credential per
+ workspace. Each credential authenticates to exactly one
+ workspace; the relationship between user and workspace is a
+ regime-internal detail the gateway does not see.
+
+When the gateway authenticates a credential, the IAM regime returns
+an `Identity` whose `workspace` is the workspace this credential is
+for. That value — not "the user's workspace" — is what the gateway
+uses for default-fill-in and as input to the IAM `authorise` call.
+
+#### Identity surface
+
+What the gateway holds after `authenticate`:
+
+| Field | Purpose |
+|-------|---------|
+| `handle` | Opaque token quoted back when calling `authorise`. Regime-defined. |
+| `workspace` | The workspace this credential authenticates to. Used as the default if a request omits workspace. |
+| `principal_id` | Stable identifier for audit logging (a user id, sub claim, service account id). Never used for authorisation. |
+| `source` | How the credential was presented (`api-key`, `jwt`). Logged with audit events; not policy input. |
+
+Anything else — roles, claims, group memberships, policy attributes
+— stays inside the regime and is reachable only via `authorise`.
+See [`iam-contract.md`](iam-contract.md) for the full contract.
+
+#### OSS user record
+
+The OSS regime stores the following per user. These fields are
+**OSS-implementation specifics**, not part of the contract.
| Field | Type | Description |
|-------|------|-------------|
| `id` | string | Unique user identifier (UUID) |
| `name` | string | Display name |
| `email` | string | Email address (optional) |
-| `workspace` | string | Workspace the user belongs to |
+| `workspace` | string | Home workspace; default binding for issued credentials |
| `roles` | list[string] | Assigned roles (e.g. `["reader"]`) |
| `enabled` | bool | Whether the user can authenticate |
| `created` | datetime | Account creation timestamp |
-The `workspace` field maps to the existing `user` field in `Metadata`.
-This means the storage-layer isolation (Cassandra, Neo4j, Qdrant
-filtering by `user` + `collection`) works without changes — the gateway
-sets the `user` metadata field to the authenticated user's workspace.
+The `workspace` field on a user record is the **default binding**
+used when issuing credentials, not a constraint visible to the
+gateway. An enterprise regime may have no user records at all
+(authentication delegated to an IdP).
### Workspaces
-A workspace is an isolated data boundary. Users belong to a workspace,
-and all data operations are scoped to it. Workspaces map to the existing
-`user` field in `Metadata` and the corresponding Cassandra keyspace,
-Qdrant collection prefix, and Neo4j property filters.
+A workspace is an isolated data boundary — a tenancy scope in which
+users, flows, configuration, documents, and knowledge graphs live.
+Workspaces map to storage-layer isolation: the `user` field in
+`Metadata`, the corresponding Cassandra keyspace, the Qdrant
+collection prefix, and the Neo4j property filter.
+
+Workspace is the most prominent component of an operation's
+**resource scope**: when a request says "do X to Y", workspace is
+part of "Y". Listing users, creating flows, querying the graph —
+all of these target a specific workspace.
| Field | Type | Description |
|-------|------|-------------|
@@ -322,57 +395,176 @@ Qdrant collection prefix, and Neo4j property filters.
| `enabled` | bool | Whether the workspace is active |
| `created` | datetime | Creation timestamp |
-All data operations are scoped to a workspace. The gateway determines
-the effective workspace for each request as follows:
+#### Default-fill-in
-1. If the request includes a `workspace` parameter, validate it against
- the user's assigned workspace.
- - If it matches, use it.
- - If it does not match, return 403. (This could be extended to
- check a workspace access grant list.)
-2. If no `workspace` parameter is provided, use the user's assigned
- workspace.
+If a request omits workspace, the gateway fills it in from the
+authenticated identity's bound workspace (`identity.workspace`)
+before any IAM check runs. IAM never receives an unresolved
+workspace; every `authorise` call sees a concrete value.
-The gateway sets the `user` field in `Metadata` to the effective
-workspace ID, replacing the caller-supplied `?user=` query parameter.
+#### Authorisation
-This design ensures forward compatibility. Clients that pass a
-workspace parameter will work unchanged if multi-workspace support is
-added later. Requests for an unassigned workspace get a clear 403
-rather than silent misbehaviour.
+Whether the resolved workspace is permitted to be operated on by
+this caller is an **IAM decision**, not a gateway one. The gateway
+calls `authorise(identity, capability, {workspace: ..., ...})` and
+relays the answer. In the OSS regime, the regime checks whether
+the caller's permission grants for `<capability>` include this
+workspace — see [`capabilities.md`](capabilities.md). In other
+regimes the decision could come from group mappings, policies,
+relationship tuples, or anything else the regime models.
+
+### Request anatomy
+
+The shape of a request — where workspace appears, where flow
+appears, where parameters live — follows from **the level of the
+resource being operated on**, not from any single property of the
+request like its URL or its required capability.
+
+Resources live at one of three levels (see also the resource model
+in [`iam-contract.md`](iam-contract.md)):
+
+| Resource level | Examples | Resource address |
+|---|---|---|
+| **System** | The user registry, the workspace registry, the IAM signing key, the audit log | empty `{}` |
+| **Workspace** | A workspace's config, flow definitions, library, knowledge cores, collections | `{workspace: ...}` |
+| **Flow** | A flow's knowledge graph, agent state, LLM context, embeddings, MCP context | `{workspace: ..., flow: ...}` |
+
+For the gateway-to-bus mapping this dictates **where workspace
+lives in the message**, but only when workspace is part of the
+*resource address*. Workspace can also appear as an *operation
+parameter* on system-level resources (see below).
+
+#### Workspace as address vs. parameter
+
+Two distinct roles, two distinct locations:
+
+- **Workspace as address component.** Workspace identifies the
+ thing being operated on. Used for workspace-level and flow-level
+ resources. Lives in the addressing layer of the message — the
+ URL path for HTTP, or the WebSocket envelope alongside `flow` for
+ flow-scoped operations sent through the Mux.
+- **Workspace as operation parameter.** Workspace is data the
+ operation acts on, while the resource itself is system-level.
+ Used for operations on the user registry (`create-user with
+ workspace association W`), the workspace registry (`create-
+ workspace W`), and other system-level operations that happen to
+ reference a workspace. Lives in the request body or inner WS
+ payload alongside the operation's other parameters.
+
+The two roles never coexist on the same operation. Either the
+operation addresses something within a workspace (workspace is in
+the address), or it operates on a system-level resource with
+workspace as a parameter (workspace is in the body), or workspace
+is irrelevant (system-level operations such as `bootstrap`,
+`rotate-signing-key`, and `login` itself).
+
+#### Where workspace lives, by request type
+
+| Request type | Resource level | Workspace lives in |
+|---|---|---|
+| Flow-scoped data plane (`agent`, `graph-rag`, `llm`, `embeddings`, `mcp`, etc.) | Flow | Envelope alongside `flow` (WS) or URL path (HTTP) — part of the address |
+| Workspace-scoped control plane (`config`, `library`, `knowledge`, `collection-management`, flow lifecycle) | Workspace | Body / inner request — part of the address |
+| User registry ops (`create-user`, `list-users`, `disable-user`, etc.) | System | Body — as a *parameter* (the user's workspace association or a list filter) |
+| Workspace registry ops (`create-workspace`, `list-workspaces`, etc.) | System | Body — as a *parameter* (the workspace identifier in `workspace_record`) |
+| Credential ops (`create-api-key`, `revoke-api-key`, `change-password`, `reset-password`) | System | Body — as a *parameter* on ops that have one; absent on `change-password` (target is the caller's identity) |
+| System ops (`bootstrap`, `login`, `rotate-signing-key`, `get-signing-key-public`) | System | Not present at all |
+
+The classification is deliberate. Users are a global concept that
+*have* a workspace; they don't *live* in one. An OSS regime has
+1:1 user-to-workspace; a multi-workspace regime maps a user to many
+workspaces; an SSO regime might delegate workspace membership to an
+IdP entirely. The gateway treats user-registry operations as
+system-level so the contract is the same across regimes — the
+workspace association is a parameter the regime interprets in its
+own terms.
+
+#### HTTP
+
+HTTP routes by URL path, so the address lives in the URL.
+Per-operation REST shape:
+
+- Flow-level: `POST /api/v1/workspaces/{w}/flows/{f}/services/{kind}`
+ — `workspace` and `flow` are URL components.
+- Workspace-level: `POST /api/v1/workspaces/{w}/config`,
+ `/api/v1/workspaces/{w}/library`, etc. — `workspace` is a URL
+ component.
+- System-level: `POST /api/v1/users`, `/api/v1/workspaces`, etc. —
+ no workspace in URL; if the operation references one, it's a
+ field in the body.
+
+`/api/v1/iam` is itself registry-driven: the body's `operation`
+field is looked up against the registry to obtain the capability,
+resource shape, and parameter shape per operation, rather than
+gating the whole endpoint with a single coarse capability.
+
+#### WebSocket Mux
+
+The Mux envelope is the addressing layer for flow-scoped
+operations. For workspace-level and system-level operations the
+envelope routes by `service` only, and the inner request payload
+carries the address components or parameters as appropriate. See
+[`iam-contract.md`](iam-contract.md) for the operation-registry
+mechanism the Mux uses to know which fields to read.
### Roles and access control
-Three roles with fixed permissions:
+Roles are an OSS-regime concept and live entirely in the IAM
+service. The gateway does not enumerate or check them — it asks
+`authorise(identity, capability, resource, parameters)` per
+request and the regime maps the caller's roles to a decision.
-| Role | Data operations | Admin operations | System |
-|------|----------------|-----------------|--------|
-| `reader` | Query knowledge graph, embeddings, RAG | None | None |
-| `writer` | All reader operations + load documents, manage collections | None | None |
-| `admin` | All writer operations | Config, flows, collection management, user management | Metrics |
+The OSS regime ships three roles:
-Role checks happen at the gateway before dispatching to backend
-services. Each endpoint declares the minimum role required:
+| Role | Capabilities granted |
+|------|----------------------|
+| `reader` | Read capabilities on data and config (`graph:read`, `documents:read`, `rows:read`, `config:read`, `flows:read`, `knowledge:read`, `collections:read`, `keys:self`, plus the per-service caps `agent`, `llm`, `embeddings`, `mcp`). |
+| `writer` | All reader capabilities, plus `graph:write`, `documents:write`, `rows:write`, `knowledge:write`, `collections:write`. |
+| `admin` | All writer capabilities, plus `config:write`, `flows:write`, `users:read`, `users:write`, `users:admin`, `keys:admin`, `workspaces:admin`, `iam:admin`, `metrics:read`. |
-| Endpoint pattern | Minimum role |
-|-----------------|--------------|
-| `GET /api/v1/socket` (queries) | `reader` |
-| `POST /api/v1/librarian` | `writer` |
-| `POST /api/v1/flow/*/import/*` | `writer` |
-| `POST /api/v1/config` | `admin` |
-| `GET /api/v1/flow/*` | `admin` |
-| `GET /api/metrics` | `admin` |
+Workspace scope is a property of the *grant*, not of the user or
+role. In the OSS regime each capability granted by `reader` /
+`writer` is scoped to the workspace the user record is associated
+with; capabilities granted by `admin` are scoped to `*` (every
+workspace). A user is a system-level object — they don't "live
+in" a workspace, they hold permissions whose scope happens to
+reference one.
-Roles are hierarchical: `admin` implies `writer`, which implies
-`reader`.
+The OSS regime is deliberately limited to one workspace association
+per user; future regimes are free to grant the same user different
+permissions in different workspaces, or use a non-workspace scope
+entirely. This is regime-internal — neither the contract nor the
+gateway carries an assumption either way.
+
+The gateway gates each endpoint by *capability*, not by role.
+Capabilities are declared per operation in the gateway's operation
+registry; see [`iam-contract.md`](iam-contract.md) for the
+registry mechanism and [`capabilities.md`](capabilities.md) for
+the capability vocabulary.
### IAM service
-The IAM service is a new backend service that manages all identity and
-access data. It is the authority for users, workspaces, API keys, and
-credentials. The gateway delegates to it.
+The IAM service is a backend service that implements the
+[IAM contract](iam-contract.md) — `authenticate`, `authorise`, and
+the management operations the gateway forwards. It is the
+authority for identity, credential validation, and access decisions.
+The gateway treats it as a black box behind the contract; nothing
+in the gateway is regime-specific.
-#### Data model
+The OSS distribution ships one IAM regime: a role-based service
+backed by Cassandra, described in
+[`iam-protocol.md`](iam-protocol.md). Enterprise / future regimes
+can replace this implementation without changing the gateway, the
+wire protocol between gateway and backends, or the capability
+vocabulary — see the contract spec for the abstraction the gateway
+is wired against and the implementation notes for what other
+regimes look like.
+
+#### OSS data model
+
+The OSS regime stores users, workspaces, API keys, and signing
+keys in Cassandra. This is an **OSS regime implementation
+detail**; it is not part of the contract. Other regimes will have
+different (or no) data models.
```
iam_workspaces (
@@ -423,44 +615,89 @@ resolve API keys and to handle login requests. User management
operations (create user, revoke key, etc.) also go through the IAM
service.
+### Error policy
+
+External error responses carry **no diagnostic detail** for
+authentication or access-control failures. The goal is to give an
+attacker probing the endpoint no signal about which condition they
+tripped.
+
+| Category | HTTP | Body | WebSocket frame |
+|----------|------|------|-----------------|
+| Authentication failure | `401 Unauthorized` | `{"error": "auth failure"}` | `{"type": "auth-failed", "error": "auth failure"}` |
+| Access control failure | `403 Forbidden` | `{"error": "access denied"}` | `{"error": "access denied"}` (endpoint-specific frame type) |
+
+"Authentication failure" covers missing credential, malformed
+credential, invalid signature, expired token, revoked API key, and
+unknown API key — all indistinguishable to the caller.
+
+"Access control failure" covers role insufficient, workspace
+mismatch, user disabled, and workspace disabled — all
+indistinguishable to the caller.
+
+**Server-side logging is richer.** The audit log records the specific
+reason (`"workspace-mismatch: user alice assigned 'acme', requested
+'beta'"`, `"role-insufficient: admin required, user has writer"`,
+etc.) for operators and post-incident forensics. These messages never
+appear in responses.
+
+Other error classes (bad request, internal error) remain descriptive
+because they do not reveal anything about the auth or access-control
+surface — e.g. `"missing required field 'workspace'"` or
+`"invalid JSON"` is fine.
+
### Gateway changes
-The current `Authenticator` class is replaced with a thin authentication
-middleware that delegates to the IAM service:
+The current `Authenticator` class is replaced with a thin
+authentication+authorisation middleware that delegates to the IAM
+service per the IAM contract. The gateway performs no role check
+itself — authorisation is asked of the regime via `authorise`.
For HTTP requests:
1. Extract Bearer token from the `Authorization` header.
2. If the token has JWT format (dotted structure):
- Validate signature locally using the cached public key.
- - Extract user ID, workspace, and roles from claims.
+ - Build an `Identity` from `sub` and `workspace` claims (no
+ other claims are consulted).
3. Otherwise, treat as an API key:
- Hash the token and check the local cache.
- - On cache miss, call the IAM service to resolve.
- - Cache the result (user/workspace/roles) with a short TTL.
+ - On cache miss, call the IAM service to resolve to an
+ `Identity` (handle, workspace, principal_id, source).
+ - Cache the result with a short TTL.
4. If neither succeeds, return 401.
-5. If the user or workspace is disabled, return 403.
-6. Check the user's role against the endpoint's minimum role. If
- insufficient, return 403.
-7. Resolve the effective workspace:
- - If the request includes a `workspace` parameter, validate it
- against the user's assigned workspace. Return 403 on mismatch.
- - If no `workspace` parameter, use the user's assigned workspace.
-8. Set the `user` field in the request context to the effective
- workspace ID. This propagates through `Metadata` to all downstream
- services.
+5. Look up the operation in the gateway's operation registry to get
+ `(capability, resource_level, extractors)`. Build the resource
+ address (system / workspace / flow level) and parameters from
+ the request.
+6. Default-fill the workspace into the body when the operation is
+ workspace- or flow-level (so downstream code sees a single
+ canonical address); the resource address keeps its supplied
+ value.
+7. Call `authorise(identity, capability, resource, parameters)`.
+ On allow, forward the request; on deny, return 403. On regime
+ error, fail closed (401 / 503 per deployment).
+8. Cache the decision per the contract's caching rules (clamped
+ above by a deployment-set ceiling).
+9. For requests forwarded to iam-svc, set `actor` on the body
+ from `identity.handle`, overwriting any caller-supplied value.
+ See [`iam-contract.md`](iam-contract.md#actor-injection).
For WebSocket connections:
1. Accept the connection in an unauthenticated state.
2. Wait for an auth message (`{"type": "auth", "token": "..."}`).
-3. Validate the token using the same logic as steps 2-7 above.
+3. Validate the token using the same logic as steps 1-3 above.
4. On success, attach the resolved identity to the connection and
send `{"type": "auth-ok", ...}`.
5. On failure, send `{"type": "auth-failed", ...}` but keep the
socket open.
6. Reject all non-auth messages until authentication succeeds.
7. Accept new auth messages at any time to re-authenticate.
+8. For each subsequent request frame, look up
+ `flow-service:<kind>` in the registry and call `authorise`
+ against the `{workspace, flow}` resource — the same authorisation
+ check that HTTP callers of the gateway go through, evaluated per frame.
### CLI changes
@@ -713,6 +950,16 @@ These are not implemented but the architecture does not preclude them:
- **Multi-workspace access.** Users could be granted access to
additional workspaces beyond their primary assignment. The workspace
validation step checks a grant list instead of a single assignment.
+- **Workspace resolver.** Workspace resolution on each authenticated
+ request — "given this user and this requested workspace, which
+ workspace (if any) may the request operate on?" — is encapsulated
+ in a single pluggable resolver. The open-source edition ships a
+ resolver that permits only the user's single assigned workspace;
+ enterprise editions that implement multi-workspace access swap in a
+ resolver that consults a permitted set. The wire protocol (the
+ optional `workspace` field on the authenticated request) is
+ identical in both editions, so clients written against one edition
+ work unchanged against the other.
- **Rules-based access control.** A separate access control service
could evaluate fine-grained policies (per-collection permissions,
operation-level restrictions, time-based access). The gateway
@@ -848,10 +1095,15 @@ service, not in the config service. Reasons:
- **API key scoping.** API keys could be scoped to specific collections
within a workspace rather than granting workspace-wide access. To be
designed when the need arises.
-- **tg-init-trustgraph** only initialises a single workspace.
## References
+- [IAM Contract Specification](iam-contract.md) — the gateway↔IAM
+ regime abstraction this design is wired against.
+- [IAM Service Protocol Specification](iam-protocol.md) — the OSS
+ regime's wire-level protocol.
+- [Capability Vocabulary Specification](capabilities.md) — the
+ capability strings the gateway uses as `authorise` input.
- [Data Ownership and Information Separation](data-ownership-model.md)
- [MCP Tool Bearer Token Specification](mcp-tool-bearer-token.md)
- [Multi-Tenant Support Specification](multi-tenant-support.md)
diff --git a/iam-testing.txt b/iam-testing.txt
new file mode 100644
index 00000000..0d03ffc3
--- /dev/null
+++ b/iam-testing.txt
@@ -0,0 +1,252 @@
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation": "bootstrap"}'
+
+
+
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation": "resolve-api-key", "api_key": "tg_r-n43hDWV9WOY06w6o5YpevAxirlS33D"}'
+
+
+
+
+
+
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation": "resolve-api-key", "api_key": "asdalsdjasdkasdasda"}'
+
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"list-users","workspace":"default"}'
+
+
+
+ # 1. Admin creates a writer user "alice"
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{
+ "operation": "create-user",
+ "workspace": "default",
+ "user": {
+ "username": "alice",
+ "name": "Alice",
+ "email": "alice@example.com",
+ "password": "changeme",
+ "roles": ["writer"]
+ }
+ }'
+ # expect: {"user": {"id": "<uuid>", ...}} — grab alice's uuid
+
+ # 2. Issue alice an API key
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{
+ "operation": "create-api-key",
+ "workspace": "default",
+ "key": {
+ "user_id": "f2363a10-3b83-44ea-a008-43caae8ba607",
+ "name": "alice-laptop"
+ }
+ }'
+ # expect: {"api_key_plaintext": "tg_...", "api_key": {"id": "<key-uuid>", "prefix": "tg_xxxx", ...}}
+
+ # 3. Resolve alice's key — should return alice's id + workspace + writer role
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"resolve-api-key","api_key":"tg_gt4buvk5NG-QS7oP_0Gk5yTWyj1qensf"}'
+
+ # expect: {"resolved_user_id":"<alice-uuid>","resolved_workspace":"default","resolved_roles":["writer"]}
+
+ # 4. List alice's keys (admin view of alice's keys)
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"list-api-keys","workspace":"default","user_id":"f2363a10-3b83-44ea-a008-43caae8ba607"}'
+ # expect: {"api_keys": [{"id":"<key-uuid>","user_id":"<alice-uuid>","name":"alice-laptop","prefix":"tg_xxxx",...}]}
+
+ # 5. Revoke alice's key
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"revoke-api-key","workspace":"default","key_id":"55f1c1f7-5448-49fd-9eda-56c192b61177"}'
+
+
+ # expect: {} (empty, no error)
+
+ # 6. Confirm the revoked key no longer resolves
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"resolve-api-key","api_key":"tg_gt4buvk5NG-QS7oP_0Gk5yTWyj1qensf"}'
+ # expect: {"error":{"type":"auth-failed","message":"unknown api key"}}
+
+
+
+----------------------------------------------------------------------------
+
+ You'll want to re-bootstrap a fresh deployment to pick up the new signing-key row (or accept that login will lazily generate one on first
+ call). Then:
+
+ # 1. Create a user with a known password (admin's password is random)
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"create-user","workspace":"default","user":{"username":"alice","password":"s3cret","roles":["writer"]}}'
+
+
+
+ # 2. Log alice in
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"login","username":"alice","password":"s3cret"}'
+ # expect: {"jwt":"eyJ...","jwt_expires":"2026-..."}
+
+ # 3. Fetch the public key (what the gateway will use later to verify)
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"get-signing-key-public"}'
+
+ # expect: {"signing_key_public":"-----BEGIN PUBLIC KEY-----\n..."}
+
+ # 4. Wrong password
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Authorization: Bearer $GATEWAY_SECRET" \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"login","username":"alice","password":"nope"}'
+
+
+
+ # expect: {"error":{"type":"auth-failed","message":"bad credentials"}}
+
+
+
+
+
+-----BEGIN PUBLIC KEY-----
+MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAseLB/a9Bo/RN/Rb/x763
++vdxmUKG75oWsXBmbwZGDXyN6fwqZ3L7cEje93qK0PYFuCHxhY1Hn0gW7FZ8ovH+
+qEksekUlpfPYqKGiT5Mb0DKk49D4yKkIbJFugWalpwIilvRbQO0jy3V8knqGQ1xL
+NfNYFrI2Rxe0Tq2OHVYc5YwYbyj1nz2TY5fd9qrzXtGRv5HZztkl25lWhRvG9G0K
+urKDdBDbi894gIYorXvcwZw/b1GDXG/aUy/By1Oy3hXnCLsN8pA3nA437TTTWxHx
+QgPH15jIF9hezO+3/ESZ7EhVEtgmwTxPddfXRa0ZoT6JyWOgcloKtnP4Lp9eQ4va
+yQIDAQAB
+-----END PUBLIC KEY-----
+
+
+
+
+
+ New operations:
+ - change-password — self-service. Requires current + new password.
+ - reset-password — admin-driven. Generates a random temporary, sets must_change_password=true, returns plaintext once.
+ - get-user, update-user, disable-user — workspace-scoped. update-user refuses to change username (immutable — error if different) and refuses
+ password-via-update. disable-user also revokes all the user's API keys, per spec.
+ - create-workspace, list-workspaces, get-workspace, update-workspace, disable-workspace — system-level. disable-workspace cascades: disables
+ all users + revokes all their keys. Rejects ids starting with _ (reserved, per the bootstrap framework convention).
+ - rotate-signing-key — generates a new Ed25519 key, retires the current one (sets retired timestamp; row stays for future grace-period
+ validation), switches the in-memory cache.
+
+ Touched files:
+ - trustgraph-flow/trustgraph/tables/iam.py — added retire_signing_key, update_user_profile, update_user_password, update_user_enabled,
+ update_workspace.
+ - trustgraph-flow/trustgraph/iam/service/iam.py — 12 new handlers + dispatch entries.
+ - trustgraph-base/trustgraph/base/iam_client.py — matching client helpers for all of them.
+
+ Smoke-test suggestions:
+
+ # change password for alice (from "s3cret" → "n3wer")
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"change-password","user_id":"b2960feb-caef-401d-af65-01bdb6960cad","password":"s3cret","new_password":"n3wer"}'
+
+ # login with new password
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"login","username":"alice","password":"n3wer"}'
+
+ # admin resets alice's password
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"reset-password","workspace":"default","user_id":"b2960feb-caef-401d-af65-01bdb6960cad"}'
+
+
+ # → {"temporary_password":"..."}
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"login","username":"alice","password":"fH2ttyrIcVXCIkH_"}'
+
+
+ # create a second workspace
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"create-workspace","workspace_record":{"id":"acme","name":"Acme Corp","enabled":true}}'
+
+
+ # rotate signing key (next login produces a JWT signed by a new kid)
+
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"rotate-signing-key"}'
+
+
+
+
+
+
+ curl -s -X POST "http://localhost:8088/api/v1/flow" \
+ -H "Authorization: Bearer tg_bs_kBAhfejiEJmbcO1gElbxk3MpV7wQFygP" \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"list-flows"}'
+
+ curl -s -X POST "http://localhost:8088/api/v1/iam" \
+ -H "Authorization: Bearer tg_bs_kBAhfejiEJmbcO1gElbxk3MpV7wQFygP" \
+ -H "Content-Type: application/json" \
+ -d '{"operation":"list-users"}'
+
+
+
+ curl -s -X POST http://localhost:8088/api/v1/iam \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer tg_bs_kBAhfejiEJmbcO1gElbxk3MpV7wQFygP" \
+ -d '{
+ "operation": "create-user",
+ "workspace": "default",
+ "user": {
+ "username": "alice",
+ "name": "Alice",
+ "email": "alice@example.com",
+ "password": "s3cret",
+ "roles": ["writer"]
+ }
+ }'
+
+
+
+
+ # Login (public, no token needed) → returns a JWT
+ curl -s -X POST "http://localhost:8088/api/v1/auth/login" \
+ -H "Content-Type: application/json" \
+ -d '{"username":"alice","password":"s3cret"}'
+
+
+
+ export TRUSTGRAPH_TOKEN=$(tg-bootstrap-iam) # on fresh bootstrap-mode deployment
+ # or set to your existing admin API key
+
+ tg-create-user --username alice --roles writer
+ # → prints alice's user id
+
+ ALICE_ID=<alice-user-id>
+
+ ALICE_KEY=$(tg-create-api-key --user-id $ALICE_ID --name alice-laptop)
+ # → alice's plaintext API key
+
+ tg-list-users
+ tg-list-api-keys --user-id $ALICE_ID
+
+ tg-revoke-api-key --key-id <...>
+ tg-disable-user --user-id $ALICE_ID
+
+ # User self-service:
+ tg-login --username alice # prompts for password, prints JWT
+ tg-change-password # prompts for current + new
+
+
diff --git a/tests/unit/test_embeddings/test_ollama_dynamic_model.py b/tests/unit/test_embeddings/test_ollama_dynamic_model.py
index d52a58c6..cfbc4d6e 100644
--- a/tests/unit/test_embeddings/test_ollama_dynamic_model.py
+++ b/tests/unit/test_embeddings/test_ollama_dynamic_model.py
@@ -14,13 +14,13 @@ from trustgraph.embeddings.ollama.processor import Processor
class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase):
"""Test Ollama dynamic model selection"""
- @patch('trustgraph.embeddings.ollama.processor.Client')
+ @patch('trustgraph.embeddings.ollama.processor.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__')
async def test_client_initialized_with_host(self, mock_embeddings_init, mock_async_init, mock_client_class):
"""Test that Ollama client is initialized with correct host"""
# Arrange
- mock_ollama_client = Mock()
+ mock_ollama_client = AsyncMock()
mock_response = Mock()
mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]]
mock_ollama_client.embed.return_value = mock_response
@@ -36,13 +36,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase):
mock_client_class.assert_called_once_with(host="http://localhost:11434")
assert processor.default_model == "test-model"
- @patch('trustgraph.embeddings.ollama.processor.Client')
+ @patch('trustgraph.embeddings.ollama.processor.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__')
async def test_on_embeddings_uses_default_model(self, mock_embeddings_init, mock_async_init, mock_client_class):
"""Test that on_embeddings uses default model when no model specified"""
# Arrange
- mock_ollama_client = Mock()
+ mock_ollama_client = AsyncMock()
mock_response = Mock()
mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]]
mock_ollama_client.embed.return_value = mock_response
@@ -62,13 +62,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase):
)
assert result == [[0.1, 0.2, 0.3, 0.4, 0.5]]
- @patch('trustgraph.embeddings.ollama.processor.Client')
+ @patch('trustgraph.embeddings.ollama.processor.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__')
async def test_on_embeddings_uses_specified_model(self, mock_embeddings_init, mock_async_init, mock_client_class):
"""Test that on_embeddings uses specified model when provided"""
# Arrange
- mock_ollama_client = Mock()
+ mock_ollama_client = AsyncMock()
mock_response = Mock()
mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]]
mock_ollama_client.embed.return_value = mock_response
@@ -88,13 +88,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase):
)
assert result == [[0.1, 0.2, 0.3, 0.4, 0.5]]
- @patch('trustgraph.embeddings.ollama.processor.Client')
+ @patch('trustgraph.embeddings.ollama.processor.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__')
async def test_multiple_model_switches(self, mock_embeddings_init, mock_async_init, mock_client_class):
"""Test switching between multiple models"""
# Arrange
- mock_ollama_client = Mock()
+ mock_ollama_client = AsyncMock()
mock_response = Mock()
mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]]
mock_ollama_client.embed.return_value = mock_response
@@ -118,13 +118,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase):
assert calls[2][1]['model'] == "model-a"
assert calls[3][1]['model'] == "test-model" # Default
- @patch('trustgraph.embeddings.ollama.processor.Client')
+ @patch('trustgraph.embeddings.ollama.processor.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__')
async def test_none_model_uses_default(self, mock_embeddings_init, mock_async_init, mock_client_class):
"""Test that None model parameter falls back to default"""
# Arrange
- mock_ollama_client = Mock()
+ mock_ollama_client = AsyncMock()
mock_response = Mock()
mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]]
mock_ollama_client.embed.return_value = mock_response
@@ -143,13 +143,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase):
input=["test text"]
)
- @patch('trustgraph.embeddings.ollama.processor.Client')
+ @patch('trustgraph.embeddings.ollama.processor.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__')
async def test_initialization_without_model_uses_default(self, mock_embeddings_init, mock_async_init, mock_client_class):
"""Test initialization without model parameter uses module default"""
# Arrange
- mock_ollama_client = Mock()
+ mock_ollama_client = AsyncMock()
mock_client_class.return_value = mock_ollama_client
mock_async_init.return_value = None
mock_embeddings_init.return_value = None
diff --git a/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py b/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py
index bae6bdbd..6a2048a5 100644
--- a/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py
+++ b/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py
@@ -277,6 +277,60 @@ class TestTripleValidation:
is_invalid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset, entity_types_invalid)
assert not is_invalid, "Invalid range should be rejected"
+ def test_is_valid_triple_subclass_is_accepted(self, extractor, sample_ontology_subset):
+ """Domain check passes when actual type is a subclass of expected."""
+ sample_ontology_subset.classes["Cake"] = {
+ "uri": "http://purl.org/ontology/fo/Cake",
+ "type": "owl:Class",
+ "subclass_of": "Recipe",
+ }
+ sample_ontology_subset.object_properties["has_ingredient"] = {
+ "domain": "Recipe",
+ "range": "Ingredient",
+ }
+
+ result = extractor.is_valid_triple(
+ subject="cake:lemon-drizzle",
+ predicate="has_ingredient",
+ object_val="ingredient:lemon",
+ ontology_subset=sample_ontology_subset,
+ entity_types={"cake:lemon-drizzle": "Cake", "ingredient:lemon": "Ingredient"},
+ )
+
+ assert result is True
+
+ def test_is_valid_triple_handles_subclass_cycle_without_infinite_loop(self, extractor, sample_ontology_subset):
+ """A cycle in subclass_of must return False instead of hanging."""
+ sample_ontology_subset.classes["A"] = {"subclass_of": "B"}
+ sample_ontology_subset.classes["B"] = {"subclass_of": "A"}
+ sample_ontology_subset.object_properties["p"] = {"domain": "Recipe", "range": "Ingredient"}
+
+ result = extractor.is_valid_triple(
+ subject="entity:x",
+ predicate="p",
+ object_val="ingredient:y",
+ ontology_subset=sample_ontology_subset,
+ entity_types={"entity:x": "A", "ingredient:y": "Ingredient"},
+ )
+
+ assert result is False
+
+ def test_is_valid_triple_entity_types_none_default(self, extractor, sample_ontology_subset):
+ """entity_types=None should not raise; domain/range checks skip if type unknown."""
+ sample_ontology_subset.object_properties["has_ingredient"] = {
+ "domain": "Recipe",
+ "range": "Ingredient",
+ }
+
+ result = extractor.is_valid_triple(
+ subject="recipe:x",
+ predicate="has_ingredient",
+ object_val="ingredient:y",
+ ontology_subset=sample_ontology_subset,
+ )
+
+ assert result is True
+
class TestTripleParsing:
"""Test suite for parsing triples from LLM responses."""
@@ -377,6 +431,24 @@ class TestTripleParsing:
assert triple.p.type == IRI, "Predicate should be IRI type"
assert triple.o.type == LITERAL, "Object literal should be LITERAL type"
+ def test_parse_and_validate_triples_collects_entity_types_from_rdf_type(self, extractor, sample_ontology_subset):
+ """entity_types should be built from rdf:type triples in the same batch."""
+ sample_ontology_subset.object_properties["has_ingredient"] = {
+ "domain": "Recipe",
+ "range": "Ingredient",
+ }
+ triples_response = [
+ {"subject": "recipe:cornish-pasty", "predicate": "rdf:type", "object": "Recipe"},
+ {"subject": "ingredient:beef", "predicate": "rdf:type", "object": "Ingredient"},
+ {"subject": "recipe:cornish-pasty", "predicate": "has_ingredient", "object": "ingredient:beef"},
+ ]
+
+ valid_triples = extractor.parse_and_validate_triples(
+ triples_response, sample_ontology_subset
+ )
+
+ assert len(valid_triples) == 3
+
class TestURIExpansionInExtraction:
"""Test suite for URI expansion during triple extraction."""
diff --git a/tests/unit/test_gateway/test_auth.py b/tests/unit/test_gateway/test_auth.py
index d4d4fc2b..26e93fd9 100644
--- a/tests/unit/test_gateway/test_auth.py
+++ b/tests/unit/test_gateway/test_auth.py
@@ -1,69 +1,447 @@
"""
-Tests for Gateway Authentication
+Tests for gateway/auth.py — IamAuth, JWT verification, API key
+resolution cache.
+
+JWTs are signed with real Ed25519 keypairs generated per-test, so
+the crypto path is exercised end-to-end without mocks. API-key
+resolution is tested against a stubbed IamClient since the real
+one requires pub/sub.
"""
+import base64
+import json
+import time
+from unittest.mock import AsyncMock, Mock, patch
+
import pytest
+from aiohttp import web
+from cryptography.hazmat.primitives import serialization
+from cryptography.hazmat.primitives.asymmetric import ed25519
-from trustgraph.gateway.auth import Authenticator
+from trustgraph.gateway.auth import (
+ IamAuth, Identity,
+ _b64url_decode, _verify_jwt_eddsa,
+ API_KEY_CACHE_TTL,
+)
-class TestAuthenticator:
- """Test cases for Authenticator class"""
+# -- helpers ---------------------------------------------------------------
- def test_authenticator_initialization_with_token(self):
- """Test Authenticator initialization with valid token"""
- auth = Authenticator(token="test-token-123")
-
- assert auth.token == "test-token-123"
- assert auth.allow_all is False
- def test_authenticator_initialization_with_allow_all(self):
- """Test Authenticator initialization with allow_all=True"""
- auth = Authenticator(allow_all=True)
-
- assert auth.token is None
- assert auth.allow_all is True
+def _b64url(data: bytes) -> str:
+ return base64.urlsafe_b64encode(data).rstrip(b"=").decode("ascii")
- def test_authenticator_initialization_without_token_raises_error(self):
- """Test Authenticator initialization without token raises RuntimeError"""
- with pytest.raises(RuntimeError, match="Need a token"):
- Authenticator()
- def test_authenticator_initialization_with_empty_token_raises_error(self):
- """Test Authenticator initialization with empty token raises RuntimeError"""
- with pytest.raises(RuntimeError, match="Need a token"):
- Authenticator(token="")
+def make_keypair():
+ priv = ed25519.Ed25519PrivateKey.generate()
+ public_pem = priv.public_key().public_bytes(
+ encoding=serialization.Encoding.PEM,
+ format=serialization.PublicFormat.SubjectPublicKeyInfo,
+ ).decode("ascii")
+ return priv, public_pem
- def test_permitted_with_allow_all_returns_true(self):
- """Test permitted method returns True when allow_all is enabled"""
- auth = Authenticator(allow_all=True)
-
- # Should return True regardless of token or roles
- assert auth.permitted("any-token", []) is True
- assert auth.permitted("different-token", ["admin"]) is True
- assert auth.permitted(None, ["user"]) is True
- def test_permitted_with_matching_token_returns_true(self):
- """Test permitted method returns True with matching token"""
- auth = Authenticator(token="secret-token")
-
- # Should return True when tokens match
- assert auth.permitted("secret-token", []) is True
- assert auth.permitted("secret-token", ["admin", "user"]) is True
+def sign_jwt(priv, claims, alg="EdDSA"):
+ header = {"alg": alg, "typ": "JWT", "kid": "kid-test"}
+ h = _b64url(json.dumps(header, separators=(",", ":"), sort_keys=True).encode())
+ p = _b64url(json.dumps(claims, separators=(",", ":"), sort_keys=True).encode())
+ signing_input = f"{h}.{p}".encode("ascii")
+ if alg == "EdDSA":
+ sig = priv.sign(signing_input)
+ else:
+ raise ValueError(f"test helper doesn't sign {alg}")
+ return f"{h}.{p}.{_b64url(sig)}"
- def test_permitted_with_non_matching_token_returns_false(self):
- """Test permitted method returns False with non-matching token"""
- auth = Authenticator(token="secret-token")
-
- # Should return False when tokens don't match
- assert auth.permitted("wrong-token", []) is False
- assert auth.permitted("different-token", ["admin"]) is False
- assert auth.permitted(None, ["user"]) is False
- def test_permitted_with_token_and_allow_all_returns_true(self):
- """Test permitted method with both token and allow_all set"""
- auth = Authenticator(token="test-token", allow_all=True)
-
- # allow_all should take precedence
- assert auth.permitted("any-token", []) is True
- assert auth.permitted("wrong-token", ["admin"]) is True
\ No newline at end of file
+def make_request(auth_header):
+ """Minimal stand-in for an aiohttp request — IamAuth only reads
+ ``request.headers["Authorization"]``."""
+ req = Mock()
+ req.headers = {}
+ if auth_header is not None:
+ req.headers["Authorization"] = auth_header
+ return req
+
+
+# -- pure helpers ----------------------------------------------------------
+
+
+class TestB64UrlDecode:
+
+ def test_round_trip_without_padding(self):
+ data = b"hello"
+ encoded = _b64url(data)
+ assert _b64url_decode(encoded) == data
+
+ def test_handles_various_lengths(self):
+ for s in (b"a", b"ab", b"abc", b"abcd", b"abcde"):
+ assert _b64url_decode(_b64url(s)) == s
+
+
+# -- JWT verification -----------------------------------------------------
+
+
+class TestVerifyJwtEddsa:
+
+ def test_valid_jwt_passes(self):
+ priv, pub = make_keypair()
+ claims = {
+ "sub": "user-1", "workspace": "default",
+ "iat": int(time.time()),
+ "exp": int(time.time()) + 60,
+ }
+ token = sign_jwt(priv, claims)
+ got = _verify_jwt_eddsa(token, pub)
+ assert got["sub"] == "user-1"
+ assert got["workspace"] == "default"
+
+ def test_expired_jwt_rejected(self):
+ priv, pub = make_keypair()
+ claims = {
+ "sub": "user-1", "workspace": "default",
+ "iat": int(time.time()) - 3600,
+ "exp": int(time.time()) - 1,
+ }
+ token = sign_jwt(priv, claims)
+ with pytest.raises(ValueError, match="expired"):
+ _verify_jwt_eddsa(token, pub)
+
+ def test_bad_signature_rejected(self):
+ priv_a, _ = make_keypair()
+ _, pub_b = make_keypair()
+ claims = {
+ "sub": "user-1", "workspace": "default",
+ "iat": int(time.time()),
+ "exp": int(time.time()) + 60,
+ }
+ token = sign_jwt(priv_a, claims)
+ # Signed with priv_a, so verification against the unrelated pub_b must fail.
+ with pytest.raises(Exception):
+ _verify_jwt_eddsa(token, pub_b)
+
+ def test_malformed_jwt_rejected(self):
+ _, pub = make_keypair()
+ with pytest.raises(ValueError, match="malformed"):
+ _verify_jwt_eddsa("not-a-jwt", pub)
+
+ def test_unsupported_algorithm_rejected(self):
+ priv, pub = make_keypair()
+ # Manually build an "alg":"HS256" header — no signer needed
+ # since we expect it to bail before verifying.
+ header = {"alg": "HS256", "typ": "JWT", "kid": "x"}
+ payload = {
+ "sub": "user-1", "workspace": "default",
+ "iat": int(time.time()), "exp": int(time.time()) + 60,
+ }
+ h = _b64url(json.dumps(header, separators=(",", ":")).encode())
+ p = _b64url(json.dumps(payload, separators=(",", ":")).encode())
+ sig = _b64url(b"not-a-real-sig")
+ token = f"{h}.{p}.{sig}"
+ with pytest.raises(ValueError, match="unsupported alg"):
+ _verify_jwt_eddsa(token, pub)
+
+
+# -- Identity --------------------------------------------------------------
+
+
+class TestIdentity:
+
+ def test_fields(self):
+ i = Identity(
+ handle="u", workspace="w",
+ principal_id="u", source="api-key",
+ )
+ assert i.handle == "u"
+ assert i.workspace == "w"
+ assert i.principal_id == "u"
+ assert i.source == "api-key"
+
+
+# -- IamAuth.authenticate --------------------------------------------------
+
+
+class TestIamAuthDispatch:
+ """``authenticate()`` chooses between the JWT and API-key paths
+ by shape of the bearer."""
+
+ @pytest.mark.asyncio
+ async def test_no_authorization_header_raises_401(self):
+ auth = IamAuth(backend=Mock())
+ with pytest.raises(web.HTTPUnauthorized):
+ await auth.authenticate(make_request(None))
+
+ @pytest.mark.asyncio
+ async def test_non_bearer_header_raises_401(self):
+ auth = IamAuth(backend=Mock())
+ with pytest.raises(web.HTTPUnauthorized):
+ await auth.authenticate(make_request("Basic whatever"))
+
+ @pytest.mark.asyncio
+ async def test_empty_bearer_raises_401(self):
+ auth = IamAuth(backend=Mock())
+ with pytest.raises(web.HTTPUnauthorized):
+ await auth.authenticate(make_request("Bearer "))
+
+ @pytest.mark.asyncio
+ async def test_unknown_format_raises_401(self):
+ # Bearer is neither a tg_-prefixed API key nor a dotted JWT.
+ auth = IamAuth(backend=Mock())
+ with pytest.raises(web.HTTPUnauthorized):
+ await auth.authenticate(make_request("Bearer garbage"))
+
+ @pytest.mark.asyncio
+ async def test_valid_jwt_resolves_to_identity(self):
+ priv, pub = make_keypair()
+ claims = {
+ "sub": "user-1", "workspace": "default",
+ "iat": int(time.time()),
+ "exp": int(time.time()) + 60,
+ }
+ token = sign_jwt(priv, claims)
+
+ auth = IamAuth(backend=Mock())
+ auth._signing_public_pem = pub
+
+ ident = await auth.authenticate(
+ make_request(f"Bearer {token}")
+ )
+ assert ident.handle == "user-1"
+ assert ident.workspace == "default"
+ assert ident.principal_id == "user-1"
+ assert ident.source == "jwt"
+
+ @pytest.mark.asyncio
+ async def test_jwt_without_public_key_fails(self):
+ # If the gateway hasn't fetched IAM's public key yet, JWTs
+ # must not validate — even ones that would otherwise pass.
+ priv, _ = make_keypair()
+ claims = {
+ "sub": "user-1", "workspace": "default",
+ "iat": int(time.time()), "exp": int(time.time()) + 60,
+ }
+ token = sign_jwt(priv, claims)
+ auth = IamAuth(backend=Mock())
+ # _signing_public_pem defaults to None
+ with pytest.raises(web.HTTPUnauthorized):
+ await auth.authenticate(make_request(f"Bearer {token}"))
+
+ @pytest.mark.asyncio
+ async def test_api_key_path(self):
+ auth = IamAuth(backend=Mock())
+
+ async def fake_resolve(api_key):
+ assert api_key == "tg_testkey"
+ # Roles are returned by the regime as a hint but the
+ # gateway ignores them — kept here so the resolve
+ # protocol shape is exercised.
+ return ("user-xyz", "default", ["admin"])
+
+ async def fake_with_client(op):
+ return await op(Mock(resolve_api_key=fake_resolve))
+
+ with patch.object(auth, "_with_client", side_effect=fake_with_client):
+ ident = await auth.authenticate(
+ make_request("Bearer tg_testkey")
+ )
+ assert ident.handle == "user-xyz"
+ assert ident.workspace == "default"
+ assert ident.principal_id == "user-xyz"
+ assert ident.source == "api-key"
+
+ @pytest.mark.asyncio
+ async def test_api_key_rejection_masked_as_401(self):
+ auth = IamAuth(backend=Mock())
+
+ async def fake_with_client(op):
+ raise RuntimeError("auth-failed: unknown api key")
+
+ with patch.object(auth, "_with_client", side_effect=fake_with_client):
+ with pytest.raises(web.HTTPUnauthorized):
+ await auth.authenticate(
+ make_request("Bearer tg_bogus")
+ )
+
+
+# -- API key cache ---------------------------------------------------------
+
+
+class TestApiKeyCache:
+
+ @pytest.mark.asyncio
+ async def test_cache_hit_skips_iam(self):
+ auth = IamAuth(backend=Mock())
+ calls = {"n": 0}
+
+ async def fake_with_client(op):
+ calls["n"] += 1
+ return await op(Mock(
+ resolve_api_key=AsyncMock(
+ return_value=("u", "default", ["reader"]),
+ )
+ ))
+
+ with patch.object(auth, "_with_client", side_effect=fake_with_client):
+ await auth.authenticate(make_request("Bearer tg_k1"))
+ await auth.authenticate(make_request("Bearer tg_k1"))
+ await auth.authenticate(make_request("Bearer tg_k1"))
+
+ # Only the first lookup reaches IAM; the rest are cache hits.
+ assert calls["n"] == 1
+
+ @pytest.mark.asyncio
+ async def test_different_keys_are_separately_cached(self):
+ auth = IamAuth(backend=Mock())
+ seen = []
+
+ async def fake_with_client(op):
+ async def resolve(plaintext):
+ seen.append(plaintext)
+ return ("u-" + plaintext, "default", ["reader"])
+ return await op(Mock(resolve_api_key=resolve))
+
+ with patch.object(auth, "_with_client", side_effect=fake_with_client):
+ a = await auth.authenticate(make_request("Bearer tg_a"))
+ b = await auth.authenticate(make_request("Bearer tg_b"))
+
+ assert a.handle == "u-tg_a"
+ assert b.handle == "u-tg_b"
+ assert seen == ["tg_a", "tg_b"]
+
+ @pytest.mark.asyncio
+ async def test_cache_has_ttl_constant_set(self):
+ # Not a behaviour test — just guards against accidentally setting the
+ # TTL below 10 (which would all but defeat the cache) or above 3600.
+ assert 10 <= API_KEY_CACHE_TTL <= 3600
+
+
+# -- IamAuth.authorise -----------------------------------------------------
+
+
+class TestAuthorise:
+ """``authorise()`` is the gateway's only authorisation entry
+ point under the IAM contract. It calls iam-svc, caches the
+ decision for the regime's TTL (clamped above), and raises 403
+ on deny / 401 on regime error (fail closed)."""
+
+ def _make_identity(self, handle="u-1", workspace="default"):
+ return Identity(
+ handle=handle, workspace=workspace,
+ principal_id=handle, source="api-key",
+ )
+
+ @pytest.mark.asyncio
+ async def test_allow_returns_no_exception(self):
+ auth = IamAuth(backend=Mock())
+
+ async def fake_with_client(op):
+ return await op(Mock(
+ authorise=AsyncMock(return_value=(True, 30)),
+ ))
+
+ with patch.object(auth, "_with_client", side_effect=fake_with_client):
+ await auth.authorise(
+ self._make_identity(),
+ "graph:read",
+ {"workspace": "default"},
+ {},
+ )
+
+ @pytest.mark.asyncio
+ async def test_deny_raises_403(self):
+ auth = IamAuth(backend=Mock())
+
+ async def fake_with_client(op):
+ return await op(Mock(
+ authorise=AsyncMock(return_value=(False, 30)),
+ ))
+
+ with patch.object(auth, "_with_client", side_effect=fake_with_client):
+ with pytest.raises(web.HTTPForbidden):
+ await auth.authorise(
+ self._make_identity(),
+ "users:admin",
+ {},
+ {"workspace": "acme"},
+ )
+
+ @pytest.mark.asyncio
+ async def test_regime_error_fails_closed_as_401(self):
+ # If iam-svc errors, the gateway must NOT silently allow.
+ auth = IamAuth(backend=Mock())
+
+ async def fake_with_client(op):
+ raise RuntimeError("iam-svc down")
+
+ with patch.object(auth, "_with_client", side_effect=fake_with_client):
+ with pytest.raises(web.HTTPUnauthorized):
+ await auth.authorise(
+ self._make_identity(),
+ "graph:read",
+ {"workspace": "default"},
+ {},
+ )
+
+ @pytest.mark.asyncio
+ async def test_allow_decision_is_cached(self):
+ auth = IamAuth(backend=Mock())
+ calls = {"n": 0}
+
+ async def fake_with_client(op):
+ calls["n"] += 1
+ return await op(Mock(
+ authorise=AsyncMock(return_value=(True, 30)),
+ ))
+
+ with patch.object(auth, "_with_client", side_effect=fake_with_client):
+ ident = self._make_identity()
+ for _ in range(5):
+ await auth.authorise(
+ ident, "graph:read", {"workspace": "default"}, {},
+ )
+
+ assert calls["n"] == 1
+
+ @pytest.mark.asyncio
+ async def test_deny_decision_is_cached(self):
+ auth = IamAuth(backend=Mock())
+ calls = {"n": 0}
+
+ async def fake_with_client(op):
+ calls["n"] += 1
+ return await op(Mock(
+ authorise=AsyncMock(return_value=(False, 30)),
+ ))
+
+ with patch.object(auth, "_with_client", side_effect=fake_with_client):
+ ident = self._make_identity()
+ for _ in range(5):
+ with pytest.raises(web.HTTPForbidden):
+ await auth.authorise(
+ ident, "users:admin", {}, {"workspace": "acme"},
+ )
+
+ # Denies are cached too — repeated attempts don't re-hit IAM.
+ assert calls["n"] == 1
+
+ @pytest.mark.asyncio
+ async def test_different_resources_cached_separately(self):
+ auth = IamAuth(backend=Mock())
+ calls = {"n": 0}
+
+ async def fake_with_client(op):
+ calls["n"] += 1
+ return await op(Mock(
+ authorise=AsyncMock(return_value=(True, 30)),
+ ))
+
+ with patch.object(auth, "_with_client", side_effect=fake_with_client):
+ ident = self._make_identity()
+ await auth.authorise(
+ ident, "graph:read", {"workspace": "a"}, {},
+ )
+ await auth.authorise(
+ ident, "graph:read", {"workspace": "b"}, {},
+ )
+
+ # Different resource → different cache key → two IAM calls.
+ assert calls["n"] == 2
diff --git a/tests/unit/test_gateway/test_capabilities.py b/tests/unit/test_gateway/test_capabilities.py
new file mode 100644
index 00000000..102e381e
--- /dev/null
+++ b/tests/unit/test_gateway/test_capabilities.py
@@ -0,0 +1,171 @@
+"""
+Tests for gateway/capabilities.py — the thin authorisation surface
+under the IAM contract.
+
+The gateway no longer holds policy state (roles, capability sets,
+workspace scopes); those live in iam-svc. These tests cover only
+what the gateway shim does itself: PUBLIC / AUTHENTICATED short-
+circuiting, default-fill of workspace, and forwarding of capability
+checks to ``auth.authorise``.
+"""
+
+import pytest
+from aiohttp import web
+from unittest.mock import AsyncMock, MagicMock
+
+from trustgraph.gateway.capabilities import (
+ PUBLIC, AUTHENTICATED,
+ enforce, enforce_workspace,
+ access_denied, auth_failure,
+)
+
+
+# -- test fixtures ---------------------------------------------------------
+
+
+class _Identity:
+ """Stand-in for auth.Identity — under the IAM contract it has
+ just ``handle``, ``workspace``, ``principal_id``, ``source``."""
+
+ def __init__(self, handle="user-1", workspace="default"):
+ self.handle = handle
+ self.workspace = workspace
+ self.principal_id = handle
+ self.source = "api-key"
+
+
+def _allow_auth(identity=None):
+ """Build an Auth double that authenticates to ``identity`` and
+ allows every authorise() call."""
+ auth = MagicMock()
+ auth.authenticate = AsyncMock(
+ return_value=identity or _Identity(),
+ )
+ auth.authorise = AsyncMock(return_value=None)
+ return auth
+
+
+def _deny_auth(identity=None):
+ """Build an Auth double that authenticates but denies authorise."""
+ auth = MagicMock()
+ auth.authenticate = AsyncMock(
+ return_value=identity or _Identity(),
+ )
+ auth.authorise = AsyncMock(side_effect=access_denied())
+ return auth
+
+
+# -- enforce() -------------------------------------------------------------
+
+
+class TestEnforce:
+
+ @pytest.mark.asyncio
+ async def test_public_returns_none_no_auth(self):
+ auth = _allow_auth()
+ result = await enforce(MagicMock(), auth, PUBLIC)
+ assert result is None
+ auth.authenticate.assert_not_called()
+ auth.authorise.assert_not_called()
+
+ @pytest.mark.asyncio
+ async def test_authenticated_skips_authorise(self):
+ identity = _Identity()
+ auth = _allow_auth(identity)
+ result = await enforce(MagicMock(), auth, AUTHENTICATED)
+ assert result is identity
+ auth.authenticate.assert_awaited_once()
+ auth.authorise.assert_not_called()
+
+ @pytest.mark.asyncio
+ async def test_capability_calls_authorise_system_level(self):
+ identity = _Identity()
+ auth = _allow_auth(identity)
+ result = await enforce(MagicMock(), auth, "graph:read")
+ assert result is identity
+ auth.authorise.assert_awaited_once_with(
+ identity, "graph:read", {}, {},
+ )
+
+ @pytest.mark.asyncio
+ async def test_capability_denied_raises_forbidden(self):
+ auth = _deny_auth()
+ with pytest.raises(web.HTTPForbidden):
+ await enforce(MagicMock(), auth, "users:admin")
+
+
+# -- enforce_workspace() ---------------------------------------------------
+
+
+class TestEnforceWorkspace:
+
+ @pytest.mark.asyncio
+ async def test_default_fills_from_identity(self):
+ data = {"operation": "x"}
+ auth = _allow_auth()
+ await enforce_workspace(data, _Identity(workspace="default"), auth)
+ assert data["workspace"] == "default"
+
+ @pytest.mark.asyncio
+ async def test_caller_supplied_workspace_kept(self):
+ data = {"workspace": "acme", "operation": "x"}
+ auth = _allow_auth()
+ await enforce_workspace(data, _Identity(workspace="default"), auth)
+ assert data["workspace"] == "acme"
+
+ @pytest.mark.asyncio
+ async def test_no_capability_skips_authorise(self):
+ data = {"workspace": "default"}
+ auth = _allow_auth()
+ await enforce_workspace(data, _Identity(), auth)
+ auth.authorise.assert_not_called()
+
+ @pytest.mark.asyncio
+ async def test_capability_calls_authorise_with_resource(self):
+ data = {"workspace": "acme"}
+ identity = _Identity()
+ auth = _allow_auth(identity)
+ await enforce_workspace(
+ data, identity, auth, capability="graph:read",
+ )
+ auth.authorise.assert_awaited_once_with(
+ identity, "graph:read", {"workspace": "acme"}, {},
+ )
+
+ @pytest.mark.asyncio
+ async def test_capability_denied_propagates(self):
+ data = {"workspace": "acme"}
+ auth = _deny_auth()
+ with pytest.raises(web.HTTPForbidden):
+ await enforce_workspace(
+ data, _Identity(), auth, capability="users:admin",
+ )
+
+ @pytest.mark.asyncio
+ async def test_non_dict_passthrough(self):
+ auth = _allow_auth()
+ result = await enforce_workspace("not-a-dict", _Identity(), auth)
+ assert result == "not-a-dict"
+ auth.authorise.assert_not_called()
+
+
+# -- helpers ---------------------------------------------------------------
+
+
+class TestResponseHelpers:
+
+ def test_auth_failure_is_401(self):
+ exc = auth_failure()
+ assert exc.status == 401
+ assert "auth failure" in exc.text
+
+ def test_access_denied_is_403(self):
+ exc = access_denied()
+ assert exc.status == 403
+ assert "access denied" in exc.text
+
+
+class TestSentinels:
+
+ def test_public_and_authenticated_are_distinct(self):
+ assert PUBLIC != AUTHENTICATED
diff --git a/tests/unit/test_gateway/test_dispatch_manager.py b/tests/unit/test_gateway/test_dispatch_manager.py
index f091a46d..e399d712 100644
--- a/tests/unit/test_gateway/test_dispatch_manager.py
+++ b/tests/unit/test_gateway/test_dispatch_manager.py
@@ -42,7 +42,7 @@ class TestDispatcherManager:
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
assert manager.backend == mock_backend
assert manager.config_receiver == mock_config_receiver
@@ -59,7 +59,10 @@ class TestDispatcherManager:
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver, prefix="custom-prefix")
+ manager = DispatcherManager(
+ mock_backend, mock_config_receiver,
+ auth=Mock(), prefix="custom-prefix",
+ )
assert manager.prefix == "custom-prefix"
@@ -68,7 +71,7 @@ class TestDispatcherManager:
"""Test start_flow method"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
flow_data = {"name": "test_flow", "steps": []}
@@ -82,7 +85,7 @@ class TestDispatcherManager:
"""Test stop_flow method"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Pre-populate with a flow
flow_data = {"name": "test_flow", "steps": []}
@@ -96,7 +99,7 @@ class TestDispatcherManager:
"""Test dispatch_global_service returns DispatcherWrapper"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
wrapper = manager.dispatch_global_service()
@@ -107,7 +110,7 @@ class TestDispatcherManager:
"""Test dispatch_core_export returns DispatcherWrapper"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
wrapper = manager.dispatch_core_export()
@@ -118,7 +121,7 @@ class TestDispatcherManager:
"""Test dispatch_core_import returns DispatcherWrapper"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
wrapper = manager.dispatch_core_import()
@@ -130,7 +133,7 @@ class TestDispatcherManager:
"""Test process_core_import method"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
with patch('trustgraph.gateway.dispatch.manager.CoreImport') as mock_core_import:
mock_importer = Mock()
@@ -148,7 +151,7 @@ class TestDispatcherManager:
"""Test process_core_export method"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
with patch('trustgraph.gateway.dispatch.manager.CoreExport') as mock_core_export:
mock_exporter = Mock()
@@ -166,7 +169,7 @@ class TestDispatcherManager:
"""Test process_global_service method"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
manager.invoke_global_service = AsyncMock(return_value="global_result")
@@ -181,7 +184,7 @@ class TestDispatcherManager:
"""Test invoke_global_service with existing dispatcher"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Pre-populate with existing dispatcher
mock_dispatcher = Mock()
@@ -198,7 +201,7 @@ class TestDispatcherManager:
"""Test invoke_global_service creates new dispatcher"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
with patch('trustgraph.gateway.dispatch.manager.global_dispatchers') as mock_dispatchers:
mock_dispatcher_class = Mock()
@@ -230,7 +233,7 @@ class TestDispatcherManager:
"""Test dispatch_flow_import returns correct method"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
result = manager.dispatch_flow_import()
@@ -240,7 +243,7 @@ class TestDispatcherManager:
"""Test dispatch_flow_export returns correct method"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
result = manager.dispatch_flow_export()
@@ -250,7 +253,7 @@ class TestDispatcherManager:
"""Test dispatch_socket returns correct method"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
result = manager.dispatch_socket()
@@ -260,7 +263,7 @@ class TestDispatcherManager:
"""Test dispatch_flow_service returns DispatcherWrapper"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
wrapper = manager.dispatch_flow_service()
@@ -272,7 +275,7 @@ class TestDispatcherManager:
"""Test process_flow_import with valid flow and kind"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow
manager.flows[("default", "test_flow")] = {
@@ -308,7 +311,7 @@ class TestDispatcherManager:
"""Test process_flow_import with invalid flow"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
params = {"flow": "invalid_flow", "kind": "triples"}
@@ -323,7 +326,7 @@ class TestDispatcherManager:
warnings.simplefilter("ignore", RuntimeWarning)
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow
manager.flows[("default", "test_flow")] = {
@@ -345,7 +348,7 @@ class TestDispatcherManager:
"""Test process_flow_export with valid flow and kind"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow
manager.flows[("default", "test_flow")] = {
@@ -378,26 +381,47 @@ class TestDispatcherManager:
@pytest.mark.asyncio
async def test_process_socket(self):
- """Test process_socket method"""
+ """process_socket constructs a Mux with the manager's auth
+ instance passed through — this is the gateway's trust path
+ for first-frame WebSocket authentication. A Mux cannot be
+ built without auth (tested separately); this test pins that
+ the dispatcher-manager threads the correct auth value into
+ the Mux constructor call."""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
-
+ mock_auth = Mock()
+ manager = DispatcherManager(
+ mock_backend, mock_config_receiver, auth=mock_auth,
+ )
+
with patch('trustgraph.gateway.dispatch.manager.Mux') as mock_mux:
mock_mux_instance = Mock()
mock_mux.return_value = mock_mux_instance
-
+
result = await manager.process_socket("ws", "running", {})
-
- mock_mux.assert_called_once_with(manager, "ws", "running")
+
+ mock_mux.assert_called_once_with(
+ manager, "ws", "running", auth=mock_auth,
+ )
assert result == mock_mux_instance
+ def test_dispatcher_manager_requires_auth(self):
+ """Constructing a DispatcherManager without an auth argument
+ must fail — a no-auth DispatcherManager would produce a
+ Mux without authentication, silently downgrading the socket
+ auth path."""
+ mock_backend = Mock()
+ mock_config_receiver = Mock()
+
+ with pytest.raises(ValueError, match="auth"):
+ DispatcherManager(mock_backend, mock_config_receiver, auth=None)
+
@pytest.mark.asyncio
async def test_process_flow_service(self):
"""Test process_flow_service method"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
manager.invoke_flow_service = AsyncMock(return_value="flow_result")
@@ -412,7 +436,7 @@ class TestDispatcherManager:
"""Test invoke_flow_service with existing dispatcher"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Add flow to the flows dictionary
manager.flows[("default", "test_flow")] = {"services": {"agent": {}}}
@@ -432,7 +456,7 @@ class TestDispatcherManager:
"""Test invoke_flow_service creates request-response dispatcher"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow
manager.flows[("default", "test_flow")] = {
@@ -476,7 +500,7 @@ class TestDispatcherManager:
"""Test invoke_flow_service creates sender dispatcher"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow
manager.flows[("default", "test_flow")] = {
@@ -516,7 +540,7 @@ class TestDispatcherManager:
"""Test invoke_flow_service with invalid flow"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
with pytest.raises(RuntimeError, match="Invalid flow"):
await manager.invoke_flow_service("data", "responder", "default", "invalid_flow", "agent")
@@ -526,7 +550,7 @@ class TestDispatcherManager:
"""Test invoke_flow_service with kind not supported by flow"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow without agent interface
manager.flows[("default", "test_flow")] = {
@@ -543,7 +567,7 @@ class TestDispatcherManager:
"""Test invoke_flow_service with invalid kind"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
# Setup test flow with interface but unsupported kind
manager.flows[("default", "test_flow")] = {
@@ -570,7 +594,7 @@ class TestDispatcherManager:
"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
async def slow_start():
# Yield to the event loop so other coroutines get a chance to run,
@@ -606,7 +630,7 @@ class TestDispatcherManager:
"""
mock_backend = Mock()
mock_config_receiver = Mock()
- manager = DispatcherManager(mock_backend, mock_config_receiver)
+ manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock())
manager.flows[("default", "test_flow")] = {
"interfaces": {
diff --git a/tests/unit/test_gateway/test_dispatch_mux.py b/tests/unit/test_gateway/test_dispatch_mux.py
index a0bc9460..c1baa920 100644
--- a/tests/unit/test_gateway/test_dispatch_mux.py
+++ b/tests/unit/test_gateway/test_dispatch_mux.py
@@ -12,6 +12,19 @@ from trustgraph.gateway.dispatch.mux import Mux, MAX_QUEUE_SIZE
class TestMux:
"""Test cases for Mux class"""
+ def test_mux_requires_auth(self):
+ """Constructing a Mux without an ``auth`` argument must
+ fail. The Mux implements the first-frame auth protocol and
+ there is no no-auth mode — a no-auth Mux would silently
+ accept every frame without authenticating it."""
+ with pytest.raises(ValueError, match="auth"):
+ Mux(
+ dispatcher_manager=MagicMock(),
+ ws=MagicMock(),
+ running=MagicMock(),
+ auth=None,
+ )
+
def test_mux_initialization(self):
"""Test Mux initialization"""
mock_dispatcher_manager = MagicMock()
@@ -21,7 +34,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=mock_ws,
- running=mock_running
+ running=mock_running,
+ auth=MagicMock(),
)
assert mux.dispatcher_manager == mock_dispatcher_manager
@@ -40,7 +54,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=mock_ws,
- running=mock_running
+ running=mock_running,
+ auth=MagicMock(),
)
# Call destroy
@@ -61,7 +76,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=None,
- running=mock_running
+ running=mock_running,
+ auth=MagicMock(),
)
# Call destroy
@@ -81,7 +97,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=mock_ws,
- running=mock_running
+ running=mock_running,
+ auth=MagicMock(),
)
# Mock message with valid JSON
@@ -108,7 +125,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=mock_ws,
- running=mock_running
+ running=mock_running,
+ auth=MagicMock(),
)
# Mock message without request field
@@ -137,7 +155,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=mock_ws,
- running=mock_running
+ running=mock_running,
+ auth=MagicMock(),
)
# Mock message without id field
@@ -164,7 +183,8 @@ class TestMux:
mux = Mux(
dispatcher_manager=mock_dispatcher_manager,
ws=mock_ws,
- running=mock_running
+ running=mock_running,
+ auth=MagicMock(),
)
# Mock message with invalid JSON
diff --git a/tests/unit/test_gateway/test_endpoint_constant.py b/tests/unit/test_gateway/test_endpoint_constant.py
index f208c967..98588e55 100644
--- a/tests/unit/test_gateway/test_endpoint_constant.py
+++ b/tests/unit/test_gateway/test_endpoint_constant.py
@@ -13,29 +13,36 @@ class TestConstantEndpoint:
"""Test cases for ConstantEndpoint class"""
def test_constant_endpoint_initialization(self):
- """Test ConstantEndpoint initialization"""
+ """Construction records the configured capability on the
+ instance. The capability is a required argument — no
+ permissive default — and the test passes an explicit
+ value to demonstrate the contract."""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
-
+
endpoint = ConstantEndpoint(
endpoint_path="/api/test",
auth=mock_auth,
- dispatcher=mock_dispatcher
+ dispatcher=mock_dispatcher,
+ capability="config:read",
)
-
+
assert endpoint.path == "/api/test"
assert endpoint.auth == mock_auth
assert endpoint.dispatcher == mock_dispatcher
- assert endpoint.operation == "service"
+ assert endpoint.capability == "config:read"
@pytest.mark.asyncio
async def test_constant_endpoint_start_method(self):
"""Test ConstantEndpoint start method (should be no-op)"""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
-
- endpoint = ConstantEndpoint("/api/test", mock_auth, mock_dispatcher)
-
+
+ endpoint = ConstantEndpoint(
+ "/api/test", mock_auth, mock_dispatcher,
+ capability="config:read",
+ )
+
# start() should complete without error
await endpoint.start()
@@ -44,10 +51,13 @@ class TestConstantEndpoint:
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
mock_app = MagicMock()
-
- endpoint = ConstantEndpoint("/api/test", mock_auth, mock_dispatcher)
+
+ endpoint = ConstantEndpoint(
+ "/api/test", mock_auth, mock_dispatcher,
+ capability="config:read",
+ )
endpoint.add_routes(mock_app)
-
+
# Verify add_routes was called with POST route
mock_app.add_routes.assert_called_once()
# The call should include web.post with the path and handler
diff --git a/tests/unit/test_gateway/test_endpoint_i18n.py b/tests/unit/test_gateway/test_endpoint_i18n.py
index ab693cdf..c2b51568 100644
--- a/tests/unit/test_gateway/test_endpoint_i18n.py
+++ b/tests/unit/test_gateway/test_endpoint_i18n.py
@@ -1,4 +1,12 @@
-"""Tests for Gateway i18n pack endpoint."""
+"""Tests for Gateway i18n pack endpoint.
+
+Production registers this endpoint with ``capability=PUBLIC``: the
+login UI needs to render its own i18n strings before any user has
+authenticated, so the endpoint is deliberately pre-auth. These
+tests exercise the PUBLIC configuration — that is the production
+contract. Behaviour of authenticated endpoints is covered by the
+IamAuth tests in ``test_auth.py``.
+"""
import json
from unittest.mock import MagicMock
@@ -7,6 +15,7 @@ import pytest
from aiohttp import web
from trustgraph.gateway.endpoint.i18n import I18nPackEndpoint
+from trustgraph.gateway.capabilities import PUBLIC
class TestI18nPackEndpoint:
@@ -17,23 +26,28 @@ class TestI18nPackEndpoint:
endpoint = I18nPackEndpoint(
endpoint_path="/api/v1/i18n/packs/{lang}",
auth=mock_auth,
+ capability=PUBLIC,
)
assert endpoint.path == "/api/v1/i18n/packs/{lang}"
assert endpoint.auth == mock_auth
- assert endpoint.operation == "service"
+ assert endpoint.capability == PUBLIC
@pytest.mark.asyncio
async def test_i18n_endpoint_start_method(self):
mock_auth = MagicMock()
- endpoint = I18nPackEndpoint("/api/v1/i18n/packs/{lang}", mock_auth)
+ endpoint = I18nPackEndpoint(
+ "/api/v1/i18n/packs/{lang}", mock_auth, capability=PUBLIC,
+ )
await endpoint.start()
def test_add_routes_registers_get_handler(self):
mock_auth = MagicMock()
mock_app = MagicMock()
- endpoint = I18nPackEndpoint("/api/v1/i18n/packs/{lang}", mock_auth)
+ endpoint = I18nPackEndpoint(
+ "/api/v1/i18n/packs/{lang}", mock_auth, capability=PUBLIC,
+ )
endpoint.add_routes(mock_app)
mock_app.add_routes.assert_called_once()
@@ -41,35 +55,55 @@ class TestI18nPackEndpoint:
assert len(call_args) == 1
@pytest.mark.asyncio
- async def test_handle_unauthorized_on_invalid_auth_scheme(self):
+ async def test_handle_returns_pack_without_authenticating(self):
+ """The PUBLIC endpoint serves the language pack without
+ invoking the auth handler at all — pre-login UI must be
+ reachable. The test replaces ``auth.authenticate`` with a stub
+ that raises, so any call to it by the endpoint is caught."""
mock_auth = MagicMock()
- mock_auth.permitted.return_value = True
- endpoint = I18nPackEndpoint("/api/v1/i18n/packs/{lang}", mock_auth)
+ def _should_not_be_called(*args, **kwargs):
+ raise AssertionError(
+ "PUBLIC endpoint must not invoke auth.authenticate"
+ )
+ mock_auth.authenticate = _should_not_be_called
+
+ endpoint = I18nPackEndpoint(
+ "/api/v1/i18n/packs/{lang}", mock_auth, capability=PUBLIC,
+ )
request = MagicMock()
request.path = "/api/v1/i18n/packs/en"
+ # A caller-supplied Authorization header of any form should
+ # be ignored — PUBLIC means we don't look at it.
request.headers = {"Authorization": "Token abc"}
request.match_info = {"lang": "en"}
- resp = await endpoint.handle(request)
- assert isinstance(resp, web.HTTPUnauthorized)
-
- @pytest.mark.asyncio
- async def test_handle_returns_pack_when_permitted(self):
- mock_auth = MagicMock()
- mock_auth.permitted.return_value = True
-
- endpoint = I18nPackEndpoint("/api/v1/i18n/packs/{lang}", mock_auth)
-
- request = MagicMock()
- request.path = "/api/v1/i18n/packs/en"
- request.headers = {}
- request.match_info = {"lang": "en"}
-
resp = await endpoint.handle(request)
assert resp.status == 200
payload = json.loads(resp.body.decode("utf-8"))
assert isinstance(payload, dict)
assert "cli.verify_system_status.title" in payload
+
+ @pytest.mark.asyncio
+ async def test_handle_rejects_path_traversal(self):
+ """The ``lang`` path parameter is reflected through to the
+ filesystem-backed pack loader. The endpoint contains an
+ explicit defense against ``/`` and ``..`` in the value; this
+ test pins that defense in place."""
+ mock_auth = MagicMock()
+ endpoint = I18nPackEndpoint(
+ "/api/v1/i18n/packs/{lang}", mock_auth, capability=PUBLIC,
+ )
+
+ for bad in ("../../etc/passwd", "en/../fr", "a/b"):
+ request = MagicMock()
+ request.path = f"/api/v1/i18n/packs/{bad}"
+ request.headers = {}
+ request.match_info = {"lang": bad}
+
+ resp = await endpoint.handle(request)
+ assert isinstance(resp, web.HTTPBadRequest), (
+ f"path-traversal defense did not reject lang={bad!r}"
+ )
diff --git a/tests/unit/test_gateway/test_endpoint_manager.py b/tests/unit/test_gateway/test_endpoint_manager.py
index 4766f8d7..8f659b71 100644
--- a/tests/unit/test_gateway/test_endpoint_manager.py
+++ b/tests/unit/test_gateway/test_endpoint_manager.py
@@ -12,30 +12,24 @@ class TestEndpointManager:
"""Test cases for EndpointManager class"""
def test_endpoint_manager_initialization(self):
- """Test EndpointManager initialization creates all endpoints"""
+ """EndpointManager wires up the full endpoint set and
+ records dispatcher_manager / timeout on the instance."""
mock_dispatcher_manager = MagicMock()
mock_auth = MagicMock()
-
- # Mock dispatcher methods
- mock_dispatcher_manager.dispatch_global_service.return_value = MagicMock()
- mock_dispatcher_manager.dispatch_socket.return_value = MagicMock()
- mock_dispatcher_manager.dispatch_flow_service.return_value = MagicMock()
- mock_dispatcher_manager.dispatch_flow_import.return_value = MagicMock()
- mock_dispatcher_manager.dispatch_flow_export.return_value = MagicMock()
- mock_dispatcher_manager.dispatch_core_import.return_value = MagicMock()
- mock_dispatcher_manager.dispatch_core_export.return_value = MagicMock()
-
+
+ # The dispatcher_manager exposes a small set of factory
+ # methods — MagicMock auto-creates them, returning fresh
+ # MagicMocks on each call.
manager = EndpointManager(
dispatcher_manager=mock_dispatcher_manager,
auth=mock_auth,
prometheus_url="http://prometheus:9090",
- timeout=300
+ timeout=300,
)
-
+
assert manager.dispatcher_manager == mock_dispatcher_manager
assert manager.timeout == 300
- assert manager.services == {}
- assert len(manager.endpoints) > 0 # Should have multiple endpoints
+ assert len(manager.endpoints) > 0
def test_endpoint_manager_with_default_timeout(self):
"""Test EndpointManager with default timeout value"""
@@ -79,9 +73,17 @@ class TestEndpointManager:
prometheus_url="http://test:9090"
)
- # Verify all dispatcher methods were called during initialization
+ # Each dispatcher factory is invoked once per endpoint that
+ # needs a dedicated wire. dispatch_auth_iam is shared by
+ # two endpoints — AuthEndpoints (login / bootstrap /
+ # change-password) and IamEndpoint (registry-driven
+ # /api/v1/iam) — so it's expected to be called twice.
+ # Both forwarders pin the dispatcher to kind=iam and reuse
+ # the same factory; they're distinct from
+ # dispatch_global_service (the generic /api/v1/{kind} route).
mock_dispatcher_manager.dispatch_global_service.assert_called_once()
- mock_dispatcher_manager.dispatch_socket.assert_called() # Called twice
+ assert mock_dispatcher_manager.dispatch_auth_iam.call_count == 2
+ mock_dispatcher_manager.dispatch_socket.assert_called_once()
mock_dispatcher_manager.dispatch_flow_service.assert_called_once()
mock_dispatcher_manager.dispatch_flow_import.assert_called_once()
mock_dispatcher_manager.dispatch_flow_export.assert_called_once()
diff --git a/tests/unit/test_gateway/test_endpoint_metrics.py b/tests/unit/test_gateway/test_endpoint_metrics.py
index bacf551d..6d911bbd 100644
--- a/tests/unit/test_gateway/test_endpoint_metrics.py
+++ b/tests/unit/test_gateway/test_endpoint_metrics.py
@@ -12,31 +12,35 @@ class TestMetricsEndpoint:
"""Test cases for MetricsEndpoint class"""
def test_metrics_endpoint_initialization(self):
- """Test MetricsEndpoint initialization"""
+ """Construction records the configured capability on the
+ instance. In production MetricsEndpoint is gated by
+ 'metrics:read' so that's the natural value to pass."""
mock_auth = MagicMock()
-
+
endpoint = MetricsEndpoint(
prometheus_url="http://prometheus:9090",
endpoint_path="/metrics",
- auth=mock_auth
+ auth=mock_auth,
+ capability="metrics:read",
)
-
+
assert endpoint.prometheus_url == "http://prometheus:9090"
assert endpoint.path == "/metrics"
assert endpoint.auth == mock_auth
- assert endpoint.operation == "service"
+ assert endpoint.capability == "metrics:read"
@pytest.mark.asyncio
async def test_metrics_endpoint_start_method(self):
"""Test MetricsEndpoint start method (should be no-op)"""
mock_auth = MagicMock()
-
+
endpoint = MetricsEndpoint(
prometheus_url="http://localhost:9090",
endpoint_path="/metrics",
- auth=mock_auth
+ auth=mock_auth,
+ capability="metrics:read",
)
-
+
# start() should complete without error
await endpoint.start()
@@ -44,15 +48,16 @@ class TestMetricsEndpoint:
"""Test add_routes method registers GET route with wildcard path"""
mock_auth = MagicMock()
mock_app = MagicMock()
-
+
endpoint = MetricsEndpoint(
prometheus_url="http://prometheus:9090",
endpoint_path="/metrics",
- auth=mock_auth
+ auth=mock_auth,
+ capability="metrics:read",
)
-
+
endpoint.add_routes(mock_app)
-
+
# Verify add_routes was called with GET route
mock_app.add_routes.assert_called_once()
# The call should include web.get with wildcard path pattern
diff --git a/tests/unit/test_gateway/test_endpoint_socket.py b/tests/unit/test_gateway/test_endpoint_socket.py
index 83eb38c2..189bc32b 100644
--- a/tests/unit/test_gateway/test_endpoint_socket.py
+++ b/tests/unit/test_gateway/test_endpoint_socket.py
@@ -1,5 +1,12 @@
"""
-Tests for Gateway Socket Endpoint
+Tests for Gateway Socket Endpoint.
+
+In production the only SocketEndpoint registered with HTTP-layer
+auth is ``/api/v1/socket`` using ``capability=AUTHENTICATED`` with
+``in_band_auth=True`` (first-frame auth over the websocket frames,
+not at the handshake). The tests below use AUTHENTICATED as the
+representative capability; construction / worker / listener
+behaviour is independent of which capability is configured.
"""
import pytest
@@ -7,41 +14,47 @@ from unittest.mock import MagicMock, AsyncMock
from aiohttp import WSMsgType
from trustgraph.gateway.endpoint.socket import SocketEndpoint
+from trustgraph.gateway.capabilities import AUTHENTICATED
class TestSocketEndpoint:
"""Test cases for SocketEndpoint class"""
def test_socket_endpoint_initialization(self):
- """Test SocketEndpoint initialization"""
+ """Construction records the configured capability on the
+ instance. No permissive default is applied."""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
-
+
endpoint = SocketEndpoint(
endpoint_path="/api/socket",
auth=mock_auth,
- dispatcher=mock_dispatcher
+ dispatcher=mock_dispatcher,
+ capability=AUTHENTICATED,
)
-
+
assert endpoint.path == "/api/socket"
assert endpoint.auth == mock_auth
assert endpoint.dispatcher == mock_dispatcher
- assert endpoint.operation == "socket"
+ assert endpoint.capability == AUTHENTICATED
@pytest.mark.asyncio
async def test_worker_method(self):
"""Test SocketEndpoint worker method"""
mock_auth = MagicMock()
mock_dispatcher = AsyncMock()
-
- endpoint = SocketEndpoint("/api/socket", mock_auth, mock_dispatcher)
-
+
+ endpoint = SocketEndpoint(
+ "/api/socket", mock_auth, mock_dispatcher,
+ capability=AUTHENTICATED,
+ )
+
mock_ws = MagicMock()
mock_running = MagicMock()
-
+
# Call worker method
await endpoint.worker(mock_ws, mock_dispatcher, mock_running)
-
+
# Verify dispatcher.run was called
mock_dispatcher.run.assert_called_once()
@@ -50,8 +63,11 @@ class TestSocketEndpoint:
"""Test SocketEndpoint listener method with text message"""
mock_auth = MagicMock()
mock_dispatcher = AsyncMock()
-
- endpoint = SocketEndpoint("/api/socket", mock_auth, mock_dispatcher)
+
+ endpoint = SocketEndpoint(
+ "/api/socket", mock_auth, mock_dispatcher,
+ capability=AUTHENTICATED,
+ )
# Mock websocket with text message
mock_msg = MagicMock()
@@ -80,8 +96,11 @@ class TestSocketEndpoint:
"""Test SocketEndpoint listener method with binary message"""
mock_auth = MagicMock()
mock_dispatcher = AsyncMock()
-
- endpoint = SocketEndpoint("/api/socket", mock_auth, mock_dispatcher)
+
+ endpoint = SocketEndpoint(
+ "/api/socket", mock_auth, mock_dispatcher,
+ capability=AUTHENTICATED,
+ )
# Mock websocket with binary message
mock_msg = MagicMock()
@@ -110,8 +129,11 @@ class TestSocketEndpoint:
"""Test SocketEndpoint listener method with close message"""
mock_auth = MagicMock()
mock_dispatcher = AsyncMock()
-
- endpoint = SocketEndpoint("/api/socket", mock_auth, mock_dispatcher)
+
+ endpoint = SocketEndpoint(
+ "/api/socket", mock_auth, mock_dispatcher,
+ capability=AUTHENTICATED,
+ )
# Mock websocket with close message
mock_msg = MagicMock()
diff --git a/tests/unit/test_gateway/test_endpoint_stream.py b/tests/unit/test_gateway/test_endpoint_stream.py
index b99946c8..a3b49465 100644
--- a/tests/unit/test_gateway/test_endpoint_stream.py
+++ b/tests/unit/test_gateway/test_endpoint_stream.py
@@ -12,48 +12,57 @@ class TestStreamEndpoint:
"""Test cases for StreamEndpoint class"""
def test_stream_endpoint_initialization_with_post(self):
- """Test StreamEndpoint initialization with POST method"""
+ """Construction records the configured capability on the
+ instance. StreamEndpoint is used in production for the
+ core-import / core-export / document-stream routes; a
+ document-write capability is a realistic value for a POST
+ stream (e.g. core-import)."""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
-
+
endpoint = StreamEndpoint(
endpoint_path="/api/stream",
auth=mock_auth,
dispatcher=mock_dispatcher,
- method="POST"
+ capability="documents:write",
+ method="POST",
)
-
+
assert endpoint.path == "/api/stream"
assert endpoint.auth == mock_auth
assert endpoint.dispatcher == mock_dispatcher
- assert endpoint.operation == "service"
+ assert endpoint.capability == "documents:write"
assert endpoint.method == "POST"
def test_stream_endpoint_initialization_with_get(self):
- """Test StreamEndpoint initialization with GET method"""
+ """GET stream — export-style endpoint, read capability."""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
-
+
endpoint = StreamEndpoint(
endpoint_path="/api/stream",
auth=mock_auth,
dispatcher=mock_dispatcher,
- method="GET"
+ capability="documents:read",
+ method="GET",
)
-
+
assert endpoint.method == "GET"
def test_stream_endpoint_initialization_default_method(self):
- """Test StreamEndpoint initialization with default POST method"""
+ """Test StreamEndpoint initialization with default POST method.
+ Only ``method`` has a default (POST); the capability is not
+ defaulted — it is always required."""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
-
+
endpoint = StreamEndpoint(
endpoint_path="/api/stream",
auth=mock_auth,
- dispatcher=mock_dispatcher
+ dispatcher=mock_dispatcher,
+ capability="documents:write",
)
-
+
assert endpoint.method == "POST" # Default value
@pytest.mark.asyncio
@@ -61,9 +70,12 @@ class TestStreamEndpoint:
"""Test StreamEndpoint start method (should be no-op)"""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
-
- endpoint = StreamEndpoint("/api/stream", mock_auth, mock_dispatcher)
-
+
+ endpoint = StreamEndpoint(
+ "/api/stream", mock_auth, mock_dispatcher,
+ capability="documents:write",
+ )
+
# start() should complete without error
await endpoint.start()
@@ -72,16 +84,17 @@ class TestStreamEndpoint:
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
mock_app = MagicMock()
-
+
endpoint = StreamEndpoint(
endpoint_path="/api/stream",
auth=mock_auth,
dispatcher=mock_dispatcher,
- method="POST"
+ capability="documents:write",
+ method="POST",
)
-
+
endpoint.add_routes(mock_app)
-
+
# Verify add_routes was called with POST route
mock_app.add_routes.assert_called_once()
call_args = mock_app.add_routes.call_args[0][0]
@@ -92,16 +105,17 @@ class TestStreamEndpoint:
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
mock_app = MagicMock()
-
+
endpoint = StreamEndpoint(
endpoint_path="/api/stream",
auth=mock_auth,
dispatcher=mock_dispatcher,
- method="GET"
+ capability="documents:read",
+ method="GET",
)
-
+
endpoint.add_routes(mock_app)
-
+
# Verify add_routes was called with GET route
mock_app.add_routes.assert_called_once()
call_args = mock_app.add_routes.call_args[0][0]
@@ -112,13 +126,14 @@ class TestStreamEndpoint:
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
mock_app = MagicMock()
-
+
endpoint = StreamEndpoint(
endpoint_path="/api/stream",
auth=mock_auth,
dispatcher=mock_dispatcher,
- method="INVALID"
+ capability="documents:write",
+ method="INVALID",
)
-
+
with pytest.raises(RuntimeError, match="Bad method"):
endpoint.add_routes(mock_app)
\ No newline at end of file
diff --git a/tests/unit/test_gateway/test_endpoint_variable.py b/tests/unit/test_gateway/test_endpoint_variable.py
index ffaf4e9a..1cdc8f9f 100644
--- a/tests/unit/test_gateway/test_endpoint_variable.py
+++ b/tests/unit/test_gateway/test_endpoint_variable.py
@@ -12,29 +12,36 @@ class TestVariableEndpoint:
"""Test cases for VariableEndpoint class"""
def test_variable_endpoint_initialization(self):
- """Test VariableEndpoint initialization"""
+ """Construction records the configured capability on the
+ instance. VariableEndpoint is used in production for the
+ /api/v1/{kind} admin-scoped global service routes, so a
+ write-side capability is a realistic value for the test."""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
-
+
endpoint = VariableEndpoint(
endpoint_path="/api/variable",
auth=mock_auth,
- dispatcher=mock_dispatcher
+ dispatcher=mock_dispatcher,
+ capability="config:write",
)
-
+
assert endpoint.path == "/api/variable"
assert endpoint.auth == mock_auth
assert endpoint.dispatcher == mock_dispatcher
- assert endpoint.operation == "service"
+ assert endpoint.capability == "config:write"
@pytest.mark.asyncio
async def test_variable_endpoint_start_method(self):
"""Test VariableEndpoint start method (should be no-op)"""
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
-
- endpoint = VariableEndpoint("/api/var", mock_auth, mock_dispatcher)
-
+
+ endpoint = VariableEndpoint(
+ "/api/var", mock_auth, mock_dispatcher,
+ capability="config:write",
+ )
+
# start() should complete without error
await endpoint.start()
@@ -43,10 +50,13 @@ class TestVariableEndpoint:
mock_auth = MagicMock()
mock_dispatcher = MagicMock()
mock_app = MagicMock()
-
- endpoint = VariableEndpoint("/api/variable", mock_auth, mock_dispatcher)
+
+ endpoint = VariableEndpoint(
+ "/api/variable", mock_auth, mock_dispatcher,
+ capability="config:write",
+ )
endpoint.add_routes(mock_app)
-
+
# Verify add_routes was called with POST route
mock_app.add_routes.assert_called_once()
call_args = mock_app.add_routes.call_args[0][0]
diff --git a/tests/unit/test_gateway/test_service.py b/tests/unit/test_gateway/test_service.py
index 71428db4..107e6819 100644
--- a/tests/unit/test_gateway/test_service.py
+++ b/tests/unit/test_gateway/test_service.py
@@ -1,355 +1,179 @@
"""
-Tests for Gateway Service API
+Tests for gateway/service.py — the Api class that wires together
+the pub/sub backend, IAM auth, config receiver, dispatcher manager,
+and endpoint manager.
+
+The legacy ``GATEWAY_SECRET`` / ``default_api_token`` / allow-all
+surface is gone, so the tests here focus on the Api's construction
+and composition rather than the removed auth behaviour. IamAuth's
+own behaviour is covered in test_auth.py.
"""
import pytest
-import asyncio
-from unittest.mock import Mock, patch, MagicMock, AsyncMock
+from unittest.mock import AsyncMock, Mock, patch
from aiohttp import web
-import pulsar
-from trustgraph.gateway.service import Api, run, default_pulsar_host, default_prometheus_url, default_timeout, default_port, default_api_token
-
-# Tests for Gateway Service API
+from trustgraph.gateway.service import (
+ Api,
+ default_pulsar_host, default_prometheus_url,
+ default_timeout, default_port,
+)
+from trustgraph.gateway.auth import IamAuth
-class TestApi:
- """Test cases for Api class"""
-
+# -- constants -------------------------------------------------------------
- def test_api_initialization_with_defaults(self):
- """Test Api initialization with default values"""
- with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
- mock_backend = Mock()
- mock_get_pubsub.return_value = mock_backend
- api = Api()
+class TestDefaults:
- assert api.port == default_port
- assert api.timeout == default_timeout
- assert api.pulsar_host == default_pulsar_host
- assert api.pulsar_api_key is None
- assert api.prometheus_url == default_prometheus_url + "/"
- assert api.auth.allow_all is True
+ def test_exports_default_constants(self):
+ # These are consumed by CLIs / tests / docs. Pin the numeric
+ # defaults exactly and sanity-check the URL schemes.
+ assert default_port == 8088
+ assert default_timeout == 600
+ assert default_pulsar_host.startswith("pulsar://")
+ assert default_prometheus_url.startswith("http")
- # Verify get_pubsub was called
- mock_get_pubsub.assert_called_once()
- def test_api_initialization_with_custom_config(self):
- """Test Api initialization with custom configuration"""
+# -- Api construction ------------------------------------------------------
+
+
+@pytest.fixture
+def mock_backend():
+ return Mock()
+
+
+@pytest.fixture
+def api(mock_backend):
+ with patch(
+ "trustgraph.gateway.service.get_pubsub",
+ return_value=mock_backend,
+ ):
+ yield Api()
+
+
+class TestApiConstruction:
+
+ def test_defaults(self, api):
+ assert api.port == default_port
+ assert api.timeout == default_timeout
+ assert api.pulsar_host == default_pulsar_host
+ assert api.pulsar_api_key is None
+ # prometheus_url gets normalised with a trailing slash
+ assert api.prometheus_url == default_prometheus_url + "/"
+
+ def test_auth_is_iam_backed(self, api):
+ # Every Api gets an IamAuth. There is no "no auth" mode
+ # (GATEWAY_SECRET / allow_all have been removed — see IAM spec).
+ assert isinstance(api.auth, IamAuth)
+
+ def test_components_wired(self, api):
+ assert api.config_receiver is not None
+ assert api.dispatcher_manager is not None
+ assert api.endpoint_manager is not None
+
+ def test_dispatcher_manager_has_auth(self, api):
+ # The Mux uses this handle for first-frame socket auth.
+ assert api.dispatcher_manager.auth is api.auth
+
+ def test_custom_config(self, mock_backend):
config = {
"port": 9000,
"timeout": 300,
"pulsar_host": "pulsar://custom-host:6650",
- "pulsar_api_key": "test-api-key",
- "pulsar_listener": "custom-listener",
+ "pulsar_api_key": "custom-key",
"prometheus_url": "http://custom-prometheus:9090",
- "api_token": "secret-token"
}
+ with patch(
+ "trustgraph.gateway.service.get_pubsub",
+ return_value=mock_backend,
+ ):
+ a = Api(**config)
- with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
- mock_backend = Mock()
- mock_get_pubsub.return_value = mock_backend
+ assert a.port == 9000
+ assert a.timeout == 300
+ assert a.pulsar_host == "pulsar://custom-host:6650"
+ assert a.pulsar_api_key == "custom-key"
+ # Trailing slash added.
+ assert a.prometheus_url == "http://custom-prometheus:9090/"
- api = Api(**config)
+ def test_prometheus_url_already_has_trailing_slash(self, mock_backend):
+ with patch(
+ "trustgraph.gateway.service.get_pubsub",
+ return_value=mock_backend,
+ ):
+ a = Api(prometheus_url="http://p:9090/")
+ assert a.prometheus_url == "http://p:9090/"
- assert api.port == 9000
- assert api.timeout == 300
- assert api.pulsar_host == "pulsar://custom-host:6650"
- assert api.pulsar_api_key == "test-api-key"
- assert api.prometheus_url == "http://custom-prometheus:9090/"
- assert api.auth.token == "secret-token"
- assert api.auth.allow_all is False
+ def test_queue_overrides_parsed_for_config(self, mock_backend):
+ with patch(
+ "trustgraph.gateway.service.get_pubsub",
+ return_value=mock_backend,
+ ):
+ a = Api(
+ config_request_queue="alt-config-req",
+ config_response_queue="alt-config-resp",
+ )
+ overrides = a.dispatcher_manager.queue_overrides
+ assert overrides.get("config", {}).get("request") == "alt-config-req"
+ assert overrides.get("config", {}).get("response") == "alt-config-resp"
- # Verify get_pubsub was called with config
- mock_get_pubsub.assert_called_once_with(**config)
- def test_api_initialization_with_pulsar_api_key(self):
- """Test Api initialization with Pulsar API key authentication"""
- with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
- mock_get_pubsub.return_value = Mock()
+# -- app_factory -----------------------------------------------------------
- api = Api(pulsar_api_key="test-key")
- # Verify api key was stored
- assert api.pulsar_api_key == "test-key"
- mock_get_pubsub.assert_called_once()
-
- def test_api_initialization_prometheus_url_normalization(self):
- """Test that prometheus_url gets normalized with trailing slash"""
- with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
- mock_get_pubsub.return_value = Mock()
-
- # Test URL without trailing slash
- api = Api(prometheus_url="http://prometheus:9090")
- assert api.prometheus_url == "http://prometheus:9090/"
-
- # Test URL with trailing slash
- api = Api(prometheus_url="http://prometheus:9090/")
- assert api.prometheus_url == "http://prometheus:9090/"
-
- def test_api_initialization_empty_api_token_means_no_auth(self):
- """Test that empty API token results in allow_all authentication"""
- with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
- mock_get_pubsub.return_value = Mock()
-
- api = Api(api_token="")
- assert api.auth.allow_all is True
-
- def test_api_initialization_none_api_token_means_no_auth(self):
- """Test that None API token results in allow_all authentication"""
- with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
- mock_get_pubsub.return_value = Mock()
-
- api = Api(api_token=None)
- assert api.auth.allow_all is True
+class TestAppFactory:
@pytest.mark.asyncio
- async def test_app_factory_creates_application(self):
- """Test that app_factory creates aiohttp application"""
- with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
- mock_get_pubsub.return_value = Mock()
-
- api = Api()
-
- # Mock the dependencies
- api.config_receiver = Mock()
- api.config_receiver.start = AsyncMock()
- api.endpoint_manager = Mock()
- api.endpoint_manager.add_routes = Mock()
- api.endpoint_manager.start = AsyncMock()
-
- app = await api.app_factory()
-
- assert isinstance(app, web.Application)
- assert app._client_max_size == 256 * 1024 * 1024
-
- # Verify that config receiver was started
- api.config_receiver.start.assert_called_once()
-
- # Verify that endpoint manager was configured
- api.endpoint_manager.add_routes.assert_called_once_with(app)
- api.endpoint_manager.start.assert_called_once()
+ async def test_creates_aiohttp_app(self, api):
+ # Stub out the long-tail dependencies that reach out to IAM /
+ # pub/sub so we can exercise the factory in isolation.
+ api.auth.start = AsyncMock()
+ api.config_receiver = Mock()
+ api.config_receiver.start = AsyncMock()
+ api.endpoint_manager = Mock()
+ api.endpoint_manager.add_routes = Mock()
+ api.endpoint_manager.start = AsyncMock()
+ api.endpoints = []
+
+ app = await api.app_factory()
+
+ assert isinstance(app, web.Application)
+ assert app._client_max_size == 256 * 1024 * 1024
+ api.auth.start.assert_called_once()
+ api.config_receiver.start.assert_called_once()
+ api.endpoint_manager.add_routes.assert_called_once_with(app)
+ api.endpoint_manager.start.assert_called_once()
@pytest.mark.asyncio
- async def test_app_factory_with_custom_endpoints(self):
- """Test app_factory with custom endpoints"""
- with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
- mock_get_pubsub.return_value = Mock()
-
- api = Api()
-
- # Mock custom endpoints
- mock_endpoint1 = Mock()
- mock_endpoint1.add_routes = Mock()
- mock_endpoint1.start = AsyncMock()
-
- mock_endpoint2 = Mock()
- mock_endpoint2.add_routes = Mock()
- mock_endpoint2.start = AsyncMock()
-
- api.endpoints = [mock_endpoint1, mock_endpoint2]
-
- # Mock the dependencies
- api.config_receiver = Mock()
- api.config_receiver.start = AsyncMock()
- api.endpoint_manager = Mock()
- api.endpoint_manager.add_routes = Mock()
- api.endpoint_manager.start = AsyncMock()
-
- app = await api.app_factory()
-
- # Verify custom endpoints were configured
- mock_endpoint1.add_routes.assert_called_once_with(app)
- mock_endpoint1.start.assert_called_once()
- mock_endpoint2.add_routes.assert_called_once_with(app)
- mock_endpoint2.start.assert_called_once()
+ async def test_auth_start_runs_before_accepting_traffic(self, api):
+ """``auth.start()`` fetches the IAM signing key, and must
+ complete (or time out) before the gateway begins accepting
+ requests. It's the first await in app_factory."""
+ order = []
- def test_run_method_calls_web_run_app(self):
- """Test that run method calls web.run_app"""
- with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub, \
- patch('aiohttp.web.run_app') as mock_run_app:
- mock_get_pubsub.return_value = Mock()
+ # A sync callable is enough for AsyncMock.side_effect (its return
+ # value becomes the awaited result); a lambda around list.append
+ # records call order and avoids the "coroutine was never awaited"
+ # trap of an async side_effect.
+ api.auth.start = AsyncMock(
+ side_effect=lambda: order.append("auth"),
+ )
+ api.config_receiver = Mock()
+ api.config_receiver.start = AsyncMock(
+ side_effect=lambda: order.append("config"),
+ )
+ api.endpoint_manager = Mock()
+ api.endpoint_manager.add_routes = Mock()
+ api.endpoint_manager.start = AsyncMock(
+ side_effect=lambda: order.append("endpoints"),
+ )
+ api.endpoints = []
- # Api.run() passes self.app_factory() — a coroutine — to
- # web.run_app, which would normally consume it inside its own
- # event loop. Since we mock run_app, close the coroutine here
- # so it doesn't leak as an "unawaited coroutine" RuntimeWarning.
- def _consume_coro(coro, **kwargs):
- coro.close()
- mock_run_app.side_effect = _consume_coro
+ await api.app_factory()
- api = Api(port=8080)
- api.run()
-
- # Verify run_app was called once with the correct port
- mock_run_app.assert_called_once()
- args, kwargs = mock_run_app.call_args
- assert len(args) == 1 # Should have one positional arg (the coroutine)
- assert kwargs == {'port': 8080} # Should have port keyword arg
-
- def test_api_components_initialization(self):
- """Test that all API components are properly initialized"""
- with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub:
- mock_get_pubsub.return_value = Mock()
-
- api = Api()
-
- # Verify all components are initialized
- assert api.config_receiver is not None
- assert api.dispatcher_manager is not None
- assert api.endpoint_manager is not None
- assert api.endpoints == []
-
- # Verify component relationships
- assert api.dispatcher_manager.backend == api.pubsub_backend
- assert api.dispatcher_manager.config_receiver == api.config_receiver
- assert api.endpoint_manager.dispatcher_manager == api.dispatcher_manager
- # EndpointManager doesn't store auth directly, it passes it to individual endpoints
-
-
-class TestRunFunction:
- """Test cases for the run() function"""
-
- def test_run_function_with_metrics_enabled(self):
- """Test run function with metrics enabled"""
- import warnings
- # Suppress the specific async warning with a broader pattern
- warnings.filterwarnings("ignore", message=".*Api.app_factory.*was never awaited", category=RuntimeWarning)
-
- with patch('argparse.ArgumentParser.parse_args') as mock_parse_args, \
- patch('trustgraph.gateway.service.start_http_server') as mock_start_http_server:
-
- # Mock command line arguments
- mock_args = Mock()
- mock_args.metrics = True
- mock_args.metrics_port = 8000
- mock_parse_args.return_value = mock_args
-
- # Create a simple mock instance without any async methods
- mock_api_instance = Mock()
- mock_api_instance.run = Mock()
-
- # Create a mock Api class without importing the real one
- mock_api = Mock(return_value=mock_api_instance)
-
- # Patch using context manager to avoid importing the real Api class
- with patch('trustgraph.gateway.service.Api', mock_api):
- # Mock vars() to return a dict
- with patch('builtins.vars') as mock_vars:
- mock_vars.return_value = {
- 'metrics': True,
- 'metrics_port': 8000,
- 'pulsar_host': default_pulsar_host,
- 'timeout': default_timeout
- }
-
- run()
-
- # Verify metrics server was started
- mock_start_http_server.assert_called_once_with(8000)
-
- # Verify Api was created and run was called
- mock_api.assert_called_once()
- mock_api_instance.run.assert_called_once()
-
- @patch('trustgraph.gateway.service.start_http_server')
- @patch('argparse.ArgumentParser.parse_args')
- def test_run_function_with_metrics_disabled(self, mock_parse_args, mock_start_http_server):
- """Test run function with metrics disabled"""
- # Mock command line arguments
- mock_args = Mock()
- mock_args.metrics = False
- mock_parse_args.return_value = mock_args
-
- # Create a simple mock instance without any async methods
- mock_api_instance = Mock()
- mock_api_instance.run = Mock()
-
- # Patch the Api class inside the test without using decorators
- with patch('trustgraph.gateway.service.Api') as mock_api:
- mock_api.return_value = mock_api_instance
-
- # Mock vars() to return a dict
- with patch('builtins.vars') as mock_vars:
- mock_vars.return_value = {
- 'metrics': False,
- 'metrics_port': 8000,
- 'pulsar_host': default_pulsar_host,
- 'timeout': default_timeout
- }
-
- run()
-
- # Verify metrics server was NOT started
- mock_start_http_server.assert_not_called()
-
- # Verify Api was created and run was called
- mock_api.assert_called_once()
- mock_api_instance.run.assert_called_once()
-
- @patch('argparse.ArgumentParser.parse_args')
- def test_run_function_argument_parsing(self, mock_parse_args):
- """Test that run function properly parses command line arguments"""
- # Mock command line arguments
- mock_args = Mock()
- mock_args.metrics = False
- mock_parse_args.return_value = mock_args
-
- # Create a simple mock instance without any async methods
- mock_api_instance = Mock()
- mock_api_instance.run = Mock()
-
- # Mock vars() to return a dict with all expected arguments
- expected_args = {
- 'pulsar_host': 'pulsar://test:6650',
- 'pulsar_api_key': 'test-key',
- 'pulsar_listener': 'test-listener',
- 'prometheus_url': 'http://test-prometheus:9090',
- 'port': 9000,
- 'timeout': 300,
- 'api_token': 'secret',
- 'log_level': 'INFO',
- 'metrics': False,
- 'metrics_port': 8001
- }
-
- # Patch the Api class inside the test without using decorators
- with patch('trustgraph.gateway.service.Api') as mock_api:
- mock_api.return_value = mock_api_instance
-
- with patch('builtins.vars') as mock_vars:
- mock_vars.return_value = expected_args
-
- run()
-
- # Verify Api was created with the parsed arguments
- mock_api.assert_called_once_with(**expected_args)
- mock_api_instance.run.assert_called_once()
-
- def test_run_function_creates_argument_parser(self):
- """Test that run function creates argument parser with correct arguments"""
- with patch('argparse.ArgumentParser') as mock_parser_class:
- mock_parser = Mock()
- mock_parser_class.return_value = mock_parser
- mock_parser.parse_args.return_value = Mock(metrics=False)
-
- with patch('trustgraph.gateway.service.Api') as mock_api, \
- patch('builtins.vars') as mock_vars:
- mock_vars.return_value = {'metrics': False}
- mock_api.return_value = Mock()
-
- run()
-
- # Verify ArgumentParser was created
- mock_parser_class.assert_called_once()
-
- # Verify add_argument was called for each expected argument
- expected_arguments = [
- 'pulsar-host', 'pulsar-api-key', 'pulsar-listener',
- 'prometheus-url', 'port', 'timeout', 'api-token',
- 'log-level', 'metrics', 'metrics-port'
- ]
-
- # Check that add_argument was called multiple times (once for each arg)
- assert mock_parser.add_argument.call_count >= len(expected_arguments)
\ No newline at end of file
+ # auth.start must be first (before config receiver, before
+ # any endpoint starts).
+ assert order[0] == "auth"
+ # All three must have run.
+ assert set(order) == {"auth", "config", "endpoints"}
diff --git a/tests/unit/test_gateway/test_socket_graceful_shutdown.py b/tests/unit/test_gateway/test_socket_graceful_shutdown.py
index 1a63227d..6c3e323b 100644
--- a/tests/unit/test_gateway/test_socket_graceful_shutdown.py
+++ b/tests/unit/test_gateway/test_socket_graceful_shutdown.py
@@ -1,4 +1,15 @@
-"""Unit tests for SocketEndpoint graceful shutdown functionality."""
+"""Unit tests for SocketEndpoint graceful shutdown functionality.
+
+These tests exercise SocketEndpoint in its handshake-auth
+configuration (``in_band_auth=False``) — the mode used in production
+for the flow import/export streaming endpoints. The mux socket at
+``/api/v1/socket`` uses ``in_band_auth=True`` instead, where the
+handshake always accepts and authentication runs on the first
+WebSocket frame; that path is covered by the Mux tests.
+
+Every endpoint constructor here passes an explicit capability — no
+permissive default is relied upon.
+"""
import pytest
import asyncio
@@ -6,13 +17,32 @@ from unittest.mock import AsyncMock, MagicMock, patch
from aiohttp import web, WSMsgType
from trustgraph.gateway.endpoint.socket import SocketEndpoint
from trustgraph.gateway.running import Running
+from trustgraph.gateway.auth import Identity
+
+
+# Representative capability used across these tests — corresponds to
+# the flow-import streaming endpoint pattern that uses this class.
+TEST_CAP = "graph:write"
+
+
+def _valid_identity():
+ return Identity(
+ handle="test-user",
+ workspace="default",
+ principal_id="test-user",
+ source="api-key",
+ )
@pytest.fixture
def mock_auth():
- """Mock authentication service."""
+ """Mock IAM-backed authenticator. Successful by default —
+ ``authenticate`` returns a valid identity and ``authorise``
+ allows everything. Tests that need the failure paths override
+ the relevant attribute locally."""
auth = MagicMock()
- auth.permitted.return_value = True
+ auth.authenticate = AsyncMock(return_value=_valid_identity())
+ auth.authorise = AsyncMock(return_value=None)
return auth
@@ -25,7 +55,7 @@ def mock_dispatcher_factory():
dispatcher.receive = AsyncMock()
dispatcher.destroy = AsyncMock()
return dispatcher
-
+
return dispatcher_factory
@@ -35,7 +65,8 @@ def socket_endpoint(mock_auth, mock_dispatcher_factory):
return SocketEndpoint(
endpoint_path="/test-socket",
auth=mock_auth,
- dispatcher=mock_dispatcher_factory
+ dispatcher=mock_dispatcher_factory,
+ capability=TEST_CAP,
)
@@ -61,7 +92,10 @@ def mock_request():
@pytest.mark.asyncio
async def test_listener_graceful_shutdown_on_close():
"""Test listener handles websocket close gracefully."""
- socket_endpoint = SocketEndpoint("/test", MagicMock(), AsyncMock())
+ socket_endpoint = SocketEndpoint(
+ "/test", MagicMock(), AsyncMock(),
+ capability=TEST_CAP,
+ )
# Mock websocket that closes after one message
ws = AsyncMock()
@@ -99,9 +133,10 @@ async def test_listener_graceful_shutdown_on_close():
@pytest.mark.asyncio
async def test_handle_normal_flow():
- """Test normal websocket handling flow."""
+ """Valid bearer → handshake accepted, dispatcher created."""
mock_auth = MagicMock()
- mock_auth.permitted.return_value = True
+ mock_auth.authenticate = AsyncMock(return_value=_valid_identity())
+ mock_auth.authorise = AsyncMock(return_value=None)
dispatcher_created = False
async def mock_dispatcher_factory(ws, running, match_info):
@@ -111,7 +146,10 @@ async def test_handle_normal_flow():
dispatcher.destroy = AsyncMock()
return dispatcher
- socket_endpoint = SocketEndpoint("/test", mock_auth, mock_dispatcher_factory)
+ socket_endpoint = SocketEndpoint(
+ "/test", mock_auth, mock_dispatcher_factory,
+ capability=TEST_CAP,
+ )
request = MagicMock()
request.query = {"token": "valid-token"}
@@ -155,7 +193,8 @@ async def test_handle_normal_flow():
async def test_handle_exception_group_cleanup():
"""Test exception group triggers dispatcher cleanup."""
mock_auth = MagicMock()
- mock_auth.permitted.return_value = True
+ mock_auth.authenticate = AsyncMock(return_value=_valid_identity())
+ mock_auth.authorise = AsyncMock(return_value=None)
mock_dispatcher = AsyncMock()
mock_dispatcher.destroy = AsyncMock()
@@ -163,7 +202,10 @@ async def test_handle_exception_group_cleanup():
async def mock_dispatcher_factory(ws, running, match_info):
return mock_dispatcher
- socket_endpoint = SocketEndpoint("/test", mock_auth, mock_dispatcher_factory)
+ socket_endpoint = SocketEndpoint(
+ "/test", mock_auth, mock_dispatcher_factory,
+ capability=TEST_CAP,
+ )
request = MagicMock()
request.query = {"token": "valid-token"}
@@ -222,7 +264,8 @@ async def test_handle_exception_group_cleanup():
async def test_handle_dispatcher_cleanup_timeout():
"""Test dispatcher cleanup with timeout."""
mock_auth = MagicMock()
- mock_auth.permitted.return_value = True
+ mock_auth.authenticate = AsyncMock(return_value=_valid_identity())
+ mock_auth.authorise = AsyncMock(return_value=None)
# Mock dispatcher that takes long to destroy
mock_dispatcher = AsyncMock()
@@ -231,7 +274,10 @@ async def test_handle_dispatcher_cleanup_timeout():
async def mock_dispatcher_factory(ws, running, match_info):
return mock_dispatcher
- socket_endpoint = SocketEndpoint("/test", mock_auth, mock_dispatcher_factory)
+ socket_endpoint = SocketEndpoint(
+ "/test", mock_auth, mock_dispatcher_factory,
+ capability=TEST_CAP,
+ )
request = MagicMock()
request.query = {"token": "valid-token"}
@@ -285,49 +331,68 @@ async def test_handle_dispatcher_cleanup_timeout():
@pytest.mark.asyncio
async def test_handle_unauthorized_request():
- """Test handling of unauthorized requests."""
+ """A bearer that the IAM layer rejects causes the handshake to
+ fail with 401. IamAuth surfaces an HTTPUnauthorized; the
+ endpoint propagates it. Note that the endpoint intentionally
+ does NOT distinguish 'bad token', 'expired', 'revoked', etc. —
+ that's the IAM error-masking policy."""
mock_auth = MagicMock()
- mock_auth.permitted.return_value = False # Unauthorized
-
- socket_endpoint = SocketEndpoint("/test", mock_auth, AsyncMock())
-
+ mock_auth.authenticate = AsyncMock(side_effect=web.HTTPUnauthorized(
+ text='{"error":"auth failure"}',
+ content_type="application/json",
+ ))
+
+ socket_endpoint = SocketEndpoint(
+ "/test", mock_auth, AsyncMock(),
+ capability=TEST_CAP,
+ )
+
request = MagicMock()
request.query = {"token": "invalid-token"}
-
+
result = await socket_endpoint.handle(request)
-
- # Should return HTTP 401
+
assert isinstance(result, web.HTTPUnauthorized)
-
- # Should have checked permission
- mock_auth.permitted.assert_called_once_with("invalid-token", "socket")
+ # authenticate must have been invoked with a synthetic request
+ # carrying ``Bearer {token}``. The endpoint wraps the query-
+ # string token into an Authorization header for a uniform auth
+ # path — the IAM layer does not look at query strings directly.
+ mock_auth.authenticate.assert_called_once()
+ passed_req = mock_auth.authenticate.call_args.args[0]
+ assert passed_req.headers["Authorization"] == "Bearer invalid-token"
@pytest.mark.asyncio
async def test_handle_missing_token():
- """Test handling of requests with missing token."""
+ """Request with no ``token`` query param → 401 before any
+ IAM call is made (cheap short-circuit)."""
mock_auth = MagicMock()
- mock_auth.permitted.return_value = False
-
- socket_endpoint = SocketEndpoint("/test", mock_auth, AsyncMock())
-
+ mock_auth.authenticate = AsyncMock(
+ side_effect=AssertionError(
+ "authenticate must not be invoked when no token is present"
+ ),
+ )
+
+ socket_endpoint = SocketEndpoint(
+ "/test", mock_auth, AsyncMock(),
+ capability=TEST_CAP,
+ )
+
request = MagicMock()
request.query = {} # No token
-
+
result = await socket_endpoint.handle(request)
-
- # Should return HTTP 401
+
assert isinstance(result, web.HTTPUnauthorized)
-
- # Should have checked permission with empty token
- mock_auth.permitted.assert_called_once_with("", "socket")
+ mock_auth.authenticate.assert_not_called()
@pytest.mark.asyncio
async def test_handle_websocket_already_closed():
"""Test handling when websocket is already closed."""
mock_auth = MagicMock()
- mock_auth.permitted.return_value = True
+ mock_auth.authenticate = AsyncMock(return_value=_valid_identity())
+ mock_auth.authorise = AsyncMock(return_value=None)
mock_dispatcher = AsyncMock()
mock_dispatcher.destroy = AsyncMock()
@@ -335,7 +400,10 @@ async def test_handle_websocket_already_closed():
async def mock_dispatcher_factory(ws, running, match_info):
return mock_dispatcher
- socket_endpoint = SocketEndpoint("/test", mock_auth, mock_dispatcher_factory)
+ socket_endpoint = SocketEndpoint(
+ "/test", mock_auth, mock_dispatcher_factory,
+ capability=TEST_CAP,
+ )
request = MagicMock()
request.query = {"token": "valid-token"}
diff --git a/tests/unit/test_text_completion/test_ollama_processor.py b/tests/unit/test_text_completion/test_ollama_processor.py
index 69baf85f..35bf182a 100644
--- a/tests/unit/test_text_completion/test_ollama_processor.py
+++ b/tests/unit/test_text_completion/test_ollama_processor.py
@@ -15,13 +15,13 @@ from trustgraph.base import LlmResult
class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
"""Test Ollama processor functionality"""
- @patch('trustgraph.model.text_completion.ollama.llm.Client')
+ @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_processor_initialization_basic(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test basic processor initialization"""
# Arrange
- mock_client = MagicMock()
+ mock_client = AsyncMock()
mock_client_class.return_value = mock_client
# Mock the parent class initialization
@@ -44,13 +44,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
assert hasattr(processor, 'llm')
mock_client_class.assert_called_once_with(host='http://localhost:11434')
- @patch('trustgraph.model.text_completion.ollama.llm.Client')
+ @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_success(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test successful content generation"""
# Arrange
- mock_client = MagicMock()
+ mock_client = AsyncMock()
mock_response = {
'response': 'Generated response from Ollama',
'prompt_eval_count': 15,
@@ -83,13 +83,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
assert result.model == 'llama2'
mock_client.generate.assert_called_once_with('llama2', "System prompt\n\nUser prompt", options={'temperature': 0.0})
- @patch('trustgraph.model.text_completion.ollama.llm.Client')
+ @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_generic_exception(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test handling of generic exceptions"""
# Arrange
- mock_client = MagicMock()
+ mock_client = AsyncMock()
mock_client.generate.side_effect = Exception("Connection error")
mock_client_class.return_value = mock_client
@@ -110,13 +110,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
with pytest.raises(Exception, match="Connection error"):
await processor.generate_content("System prompt", "User prompt")
- @patch('trustgraph.model.text_completion.ollama.llm.Client')
+ @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_processor_initialization_with_custom_parameters(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test processor initialization with custom parameters"""
# Arrange
- mock_client = MagicMock()
+ mock_client = AsyncMock()
mock_client_class.return_value = mock_client
mock_async_init.return_value = None
@@ -137,13 +137,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
assert processor.default_model == 'mistral'
mock_client_class.assert_called_once_with(host='http://192.168.1.100:11434')
- @patch('trustgraph.model.text_completion.ollama.llm.Client')
+ @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_processor_initialization_with_defaults(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test processor initialization with default values"""
# Arrange
- mock_client = MagicMock()
+ mock_client = AsyncMock()
mock_client_class.return_value = mock_client
mock_async_init.return_value = None
@@ -164,13 +164,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
# Should use default_ollama (http://localhost:11434 or from OLLAMA_HOST env)
mock_client_class.assert_called_once()
- @patch('trustgraph.model.text_completion.ollama.llm.Client')
+ @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_empty_prompts(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test content generation with empty prompts"""
# Arrange
- mock_client = MagicMock()
+ mock_client = AsyncMock()
mock_response = {
'response': 'Default response',
'prompt_eval_count': 2,
@@ -205,13 +205,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
# The prompt should be "" + "\n\n" + "" = "\n\n"
mock_client.generate.assert_called_once_with('llama2', "\n\n", options={'temperature': 0.0})
- @patch('trustgraph.model.text_completion.ollama.llm.Client')
+ @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_token_counting(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test token counting from Ollama response"""
# Arrange
- mock_client = MagicMock()
+ mock_client = AsyncMock()
mock_response = {
'response': 'Test response',
'prompt_eval_count': 50,
@@ -243,13 +243,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
assert result.out_token == 25
assert result.model == 'llama2'
- @patch('trustgraph.model.text_completion.ollama.llm.Client')
+ @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_ollama_client_initialization(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test that Ollama client is initialized correctly"""
# Arrange
- mock_client = MagicMock()
+ mock_client = AsyncMock()
mock_client_class.return_value = mock_client
mock_async_init.return_value = None
@@ -273,13 +273,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
# Verify processor has the client
assert processor.llm == mock_client
- @patch('trustgraph.model.text_completion.ollama.llm.Client')
+ @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_prompt_construction(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test prompt construction with system and user prompts"""
# Arrange
- mock_client = MagicMock()
+ mock_client = AsyncMock()
mock_response = {
'response': 'Response with system instructions',
'prompt_eval_count': 25,
@@ -312,13 +312,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
# Verify the combined prompt
mock_client.generate.assert_called_once_with('llama2', "You are a helpful assistant\n\nWhat is AI?", options={'temperature': 0.0})
- @patch('trustgraph.model.text_completion.ollama.llm.Client')
+ @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_temperature_override(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test temperature parameter override functionality"""
# Arrange
- mock_client = MagicMock()
+ mock_client = AsyncMock()
mock_response = {
'response': 'Response with custom temperature',
'prompt_eval_count': 20,
@@ -360,13 +360,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
options={'temperature': 0.8} # Should use runtime override
)
- @patch('trustgraph.model.text_completion.ollama.llm.Client')
+ @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_model_override(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test model parameter override functionality"""
# Arrange
- mock_client = MagicMock()
+ mock_client = AsyncMock()
mock_response = {
'response': 'Response with custom model',
'prompt_eval_count': 18,
@@ -408,13 +408,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase):
options={'temperature': 0.1} # Should use processor default
)
- @patch('trustgraph.model.text_completion.ollama.llm.Client')
+ @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient')
@patch('trustgraph.base.async_processor.AsyncProcessor.__init__')
@patch('trustgraph.base.llm_service.LlmService.__init__')
async def test_generate_content_both_parameters_override(self, mock_llm_init, mock_async_init, mock_client_class):
"""Test overriding both model and temperature parameters simultaneously"""
# Arrange
- mock_client = MagicMock()
+ mock_client = AsyncMock()
mock_response = {
'response': 'Response with both overrides',
'prompt_eval_count': 22,
diff --git a/trustgraph-base/trustgraph/api/async_socket_client.py b/trustgraph-base/trustgraph/api/async_socket_client.py
index e5d553ea..ca9146b9 100644
--- a/trustgraph-base/trustgraph/api/async_socket_client.py
+++ b/trustgraph-base/trustgraph/api/async_socket_client.py
@@ -49,21 +49,67 @@ class AsyncSocketClient:
return f"ws://{url}"
def _build_ws_url(self):
- ws_url = f"{self.url.rstrip('/')}/api/v1/socket"
- if self.token:
- ws_url = f"{ws_url}?token={self.token}"
- return ws_url
+ # /api/v1/socket uses the first-frame auth protocol — the
+ # token is sent as the first frame after connecting rather
+ # than in the URL. This avoids browser issues with 401 on
+ # the WebSocket handshake and lets long-lived sockets
+ # refresh credentials mid-session.
+ return f"{self.url.rstrip('/')}/api/v1/socket"
async def connect(self):
- """Establish the persistent websocket connection."""
+ """Establish the persistent websocket connection and run the
+ first-frame auth handshake."""
if self._connected:
return
+ if not self.token:
+ raise ProtocolException(
+ "AsyncSocketClient requires a token for first-frame "
+ "auth against /api/v1/socket"
+ )
+
ws_url = self._build_ws_url()
self._connect_cm = websockets.connect(
ws_url, ping_interval=20, ping_timeout=self.timeout
)
self._socket = await self._connect_cm.__aenter__()
+
+ # First-frame auth: send {"type":"auth","token":"..."} and
+ # wait for auth-ok / auth-failed. Run before starting the
+ # reader task so the response isn't consumed by the reader's
+ # id-based routing.
+ await self._socket.send(json.dumps({
+ "type": "auth", "token": self.token,
+ }))
+ try:
+ raw = await asyncio.wait_for(
+ self._socket.recv(), timeout=self.timeout,
+ )
+ except asyncio.TimeoutError:
+ await self._socket.close()
+ raise ProtocolException("Timeout waiting for auth response")
+
+ try:
+ resp = json.loads(raw)
+ except Exception:
+ await self._socket.close()
+ raise ProtocolException(
+ f"Unexpected non-JSON auth response: {raw!r}"
+ )
+
+ if resp.get("type") == "auth-ok":
+ self.workspace = resp.get("workspace", self.workspace)
+ elif resp.get("type") == "auth-failed":
+ await self._socket.close()
+ raise ProtocolException(
+ f"auth failure: {resp.get('error', 'unknown')}"
+ )
+ else:
+ await self._socket.close()
+ raise ProtocolException(
+ f"Unexpected auth response: {resp!r}"
+ )
+
self._connected = True
self._reader_task = asyncio.create_task(self._reader())
diff --git a/trustgraph-base/trustgraph/api/socket_client.py b/trustgraph-base/trustgraph/api/socket_client.py
index 4eade3e8..aeb15f85 100644
--- a/trustgraph-base/trustgraph/api/socket_client.py
+++ b/trustgraph-base/trustgraph/api/socket_client.py
@@ -112,10 +112,10 @@ class SocketClient:
return f"ws://{url}"
def _build_ws_url(self):
- ws_url = f"{self.url.rstrip('/')}/api/v1/socket"
- if self.token:
- ws_url = f"{ws_url}?token={self.token}"
- return ws_url
+ # /api/v1/socket uses the first-frame auth protocol — the
+ # token is sent as the first frame after connecting rather
+ # than in the URL.
+ return f"{self.url.rstrip('/')}/api/v1/socket"
def _get_loop(self):
"""Get or create the event loop, reusing across calls."""
@@ -132,15 +132,58 @@ class SocketClient:
return self._loop
async def _ensure_connected(self):
- """Lazily establish the persistent websocket connection."""
+ """Lazily establish the persistent websocket connection and
+ run the first-frame auth handshake."""
if self._connected:
return
+ if not self.token:
+ raise ProtocolException(
+ "SocketClient requires a token for first-frame auth "
+ "against /api/v1/socket"
+ )
+
ws_url = self._build_ws_url()
self._connect_cm = websockets.connect(
ws_url, ping_interval=20, ping_timeout=self.timeout
)
self._socket = await self._connect_cm.__aenter__()
+
+ # First-frame auth — run before starting the reader so the
+ # auth-ok / auth-failed response isn't consumed by the reader
+ # loop's id-based routing.
+ await self._socket.send(json.dumps({
+ "type": "auth", "token": self.token,
+ }))
+ try:
+ raw = await asyncio.wait_for(
+ self._socket.recv(), timeout=self.timeout,
+ )
+ except asyncio.TimeoutError:
+ await self._socket.close()
+ raise ProtocolException("Timeout waiting for auth response")
+
+ try:
+ resp = json.loads(raw)
+ except Exception:
+ await self._socket.close()
+ raise ProtocolException(
+ f"Unexpected non-JSON auth response: {raw!r}"
+ )
+
+ if resp.get("type") == "auth-ok":
+ self.workspace = resp.get("workspace", self.workspace)
+ elif resp.get("type") == "auth-failed":
+ await self._socket.close()
+ raise ProtocolException(
+ f"auth failure: {resp.get('error', 'unknown')}"
+ )
+ else:
+ await self._socket.close()
+ raise ProtocolException(
+ f"Unexpected auth response: {resp!r}"
+ )
+
self._connected = True
self._reader_task = asyncio.create_task(self._reader())
diff --git a/trustgraph-base/trustgraph/base/config_client.py b/trustgraph-base/trustgraph/base/config_client.py
index 504a6d58..eb3892f8 100644
--- a/trustgraph-base/trustgraph/base/config_client.py
+++ b/trustgraph-base/trustgraph/base/config_client.py
@@ -84,6 +84,18 @@ class ConfigClient(RequestResponse):
)
return resp.directory
+ async def get_all(self, workspace, timeout=CONFIG_TIMEOUT):
+ """Return every config entry in ``workspace`` as a nested dict
+ ``{type: {key: value}}``. Values are returned as the raw
+ strings stored by config-svc (typically JSON); callers parse
+ as needed. An empty dict means the workspace has no config."""
+ resp = await self._request(
+ operation="config",
+ workspace=workspace,
+ timeout=timeout,
+ )
+ return resp.config
+
async def workspaces_for_type(self, type, timeout=CONFIG_TIMEOUT):
"""Return the set of distinct workspaces with any config of
the given type."""
diff --git a/trustgraph-base/trustgraph/base/iam_client.py b/trustgraph-base/trustgraph/base/iam_client.py
new file mode 100644
index 00000000..4be59de1
--- /dev/null
+++ b/trustgraph-base/trustgraph/base/iam_client.py
@@ -0,0 +1,342 @@
+
+import json
+
+from . request_response_spec import RequestResponse, RequestResponseSpec
+from .. schema import (
+ IamRequest, IamResponse,
+ UserInput, WorkspaceInput, ApiKeyInput,
+)
+
+IAM_TIMEOUT = 10
+
+
+class IamClient(RequestResponse):
+ """Client for the IAM service request/response pub/sub protocol.
+
+ Mirrors ``ConfigClient``: a thin wrapper around ``RequestResponse``
+ that knows the IAM request / response schemas. Only the subset of
+ operations actually implemented by the server today has helper
+ methods here; callers that need an unimplemented operation can
+ build ``IamRequest`` and call ``request()`` directly.
+ """
+
+ async def _request(self, timeout=IAM_TIMEOUT, **kwargs):
+ resp = await self.request(
+ IamRequest(**kwargs),
+ timeout=timeout,
+ )
+ if resp.error:
+ raise RuntimeError(
+ f"{resp.error.type}: {resp.error.message}"
+ )
+ return resp
+
+ async def bootstrap(self, timeout=IAM_TIMEOUT):
+ """Initial-run IAM self-seed. Returns a tuple of
+ ``(admin_user_id, admin_api_key_plaintext)``. Both are empty
+ strings on repeat calls — the operation is a no-op once the
+ IAM tables are populated."""
+ resp = await self._request(
+ operation="bootstrap", timeout=timeout,
+ )
+ return resp.bootstrap_admin_user_id, resp.bootstrap_admin_api_key
+
+ async def bootstrap_status(self, timeout=IAM_TIMEOUT):
+ """Returns whether an unconsumed ``bootstrap`` call would
+ currently succeed (i.e. iam-svc is in ``bootstrap`` mode and
+ its tables are empty). Side-effect-free; intended for first-
+ run UX so a UI can decide whether to render setup."""
+ resp = await self._request(
+ operation="bootstrap-status", timeout=timeout,
+ )
+ return resp.bootstrap_available
+
+ async def whoami(self, actor, timeout=IAM_TIMEOUT):
+ """Return the user record for ``actor`` (the authenticated
+ caller's handle). AUTHENTICATED-only; no capability check —
+ every authenticated user can read themselves."""
+ resp = await self._request(
+ operation="whoami",
+ actor=actor,
+ timeout=timeout,
+ )
+ return resp.user
+
+ async def resolve_api_key(self, api_key, timeout=IAM_TIMEOUT):
+ """Resolve a plaintext API key to its identity triple.
+
+ Returns ``(user_id, workspace, roles)`` or raises
+ ``RuntimeError`` with error type ``auth-failed`` if the key is
+ unknown / expired / revoked.
+
+ Note: the ``roles`` value is a regime-internal hint and is
+ not used by the gateway directly under the IAM contract;
+ all authorisation decisions go through ``authorise()``.
+ Returned here only for backward compatibility with callers
+ that haven't migrated."""
+ resp = await self._request(
+ operation="resolve-api-key",
+ api_key=api_key,
+ timeout=timeout,
+ )
+ return (
+ resp.resolved_user_id,
+ resp.resolved_workspace,
+ list(resp.resolved_roles),
+ )
+
+ async def authorise(self, identity_handle, capability,
+ resource, parameters, timeout=IAM_TIMEOUT):
+ """Ask the IAM regime whether ``identity_handle`` may perform
+ ``capability`` on ``resource`` given ``parameters``.
+
+ Implements the contract ``authorise(identity, capability,
+ resource, parameters) → (decision, ttl)``. Returns a tuple
+ ``(allow: bool, ttl_seconds: int)``. The TTL is the
+ regime's suggested cache lifetime for this decision; the
+ gateway honours it (clamped above by gateway-side policy)."""
+ resp = await self._request(
+ operation="authorise",
+ user_id=identity_handle,
+ capability=capability,
+ resource_json=json.dumps(resource or {}, sort_keys=True),
+ parameters_json=json.dumps(parameters or {}, sort_keys=True),
+ timeout=timeout,
+ )
+ return resp.decision_allow, resp.decision_ttl_seconds
+
+ async def authorise_many(self, identity_handle, checks,
+ timeout=IAM_TIMEOUT):
+ """Bulk authorise. ``checks`` is a list of dicts each
+ carrying ``capability``, ``resource``, and ``parameters``.
+ Returns a list of ``(allow, ttl)`` tuples in the same order."""
+ resp = await self._request(
+ operation="authorise-many",
+ user_id=identity_handle,
+ authorise_checks=json.dumps(list(checks), sort_keys=True),
+ timeout=timeout,
+ )
+ decisions = json.loads(resp.decisions_json or "[]")
+ return [(d.get("allow", False), d.get("ttl", 0)) for d in decisions]
+
+ async def create_user(self, workspace, user, actor="",
+ timeout=IAM_TIMEOUT):
+ """Create a user. ``user`` is a ``UserInput``."""
+ resp = await self._request(
+ operation="create-user",
+ workspace=workspace,
+ actor=actor,
+ user=user,
+ timeout=timeout,
+ )
+ return resp.user
+
+ async def list_users(self, workspace, actor="", timeout=IAM_TIMEOUT):
+ resp = await self._request(
+ operation="list-users",
+ workspace=workspace,
+ actor=actor,
+ timeout=timeout,
+ )
+ return list(resp.users)
+
+ async def create_api_key(self, workspace, key, actor="",
+ timeout=IAM_TIMEOUT):
+ """Create an API key. ``key`` is an ``ApiKeyInput``. Returns
+ ``(plaintext, record)`` — plaintext is returned once and the
+ caller is responsible for surfacing it to the operator."""
+ resp = await self._request(
+ operation="create-api-key",
+ workspace=workspace,
+ actor=actor,
+ key=key,
+ timeout=timeout,
+ )
+ return resp.api_key_plaintext, resp.api_key
+
+ async def list_api_keys(self, workspace, user_id, actor="",
+ timeout=IAM_TIMEOUT):
+ resp = await self._request(
+ operation="list-api-keys",
+ workspace=workspace,
+ actor=actor,
+ user_id=user_id,
+ timeout=timeout,
+ )
+ return list(resp.api_keys)
+
+ async def revoke_api_key(self, workspace, key_id, actor="",
+ timeout=IAM_TIMEOUT):
+ await self._request(
+ operation="revoke-api-key",
+ workspace=workspace,
+ actor=actor,
+ key_id=key_id,
+ timeout=timeout,
+ )
+
+ async def login(self, username, password, workspace="",
+ timeout=IAM_TIMEOUT):
+ """Validate credentials and return ``(jwt, expires_iso)``.
+ ``workspace`` is optional; defaults at the server to the
+ OSS default workspace."""
+ resp = await self._request(
+ operation="login",
+ workspace=workspace,
+ username=username,
+ password=password,
+ timeout=timeout,
+ )
+ return resp.jwt, resp.jwt_expires
+
+ async def get_signing_key_public(self, timeout=IAM_TIMEOUT):
+ """Return the active JWT signing public key in PEM. The
+ gateway calls this at startup and caches the result."""
+ resp = await self._request(
+ operation="get-signing-key-public",
+ timeout=timeout,
+ )
+ return resp.signing_key_public
+
+ async def change_password(self, user_id, current_password,
+ new_password, timeout=IAM_TIMEOUT):
+ await self._request(
+ operation="change-password",
+ user_id=user_id,
+ password=current_password,
+ new_password=new_password,
+ timeout=timeout,
+ )
+
+ async def reset_password(self, workspace, user_id, actor="",
+ timeout=IAM_TIMEOUT):
+ """Admin-driven password reset. Returns the plaintext
+ temporary password (returned once)."""
+ resp = await self._request(
+ operation="reset-password",
+ workspace=workspace,
+ actor=actor,
+ user_id=user_id,
+ timeout=timeout,
+ )
+ return resp.temporary_password
+
+ async def get_user(self, workspace, user_id, actor="",
+ timeout=IAM_TIMEOUT):
+ resp = await self._request(
+ operation="get-user",
+ workspace=workspace,
+ actor=actor,
+ user_id=user_id,
+ timeout=timeout,
+ )
+ return resp.user
+
+ async def update_user(self, workspace, user_id, user, actor="",
+ timeout=IAM_TIMEOUT):
+ resp = await self._request(
+ operation="update-user",
+ workspace=workspace,
+ actor=actor,
+ user_id=user_id,
+ user=user,
+ timeout=timeout,
+ )
+ return resp.user
+
+ async def disable_user(self, workspace, user_id, actor="",
+ timeout=IAM_TIMEOUT):
+ await self._request(
+ operation="disable-user",
+ workspace=workspace,
+ actor=actor,
+ user_id=user_id,
+ timeout=timeout,
+ )
+
+ async def enable_user(self, workspace, user_id, actor="",
+ timeout=IAM_TIMEOUT):
+ await self._request(
+ operation="enable-user",
+ workspace=workspace,
+ actor=actor,
+ user_id=user_id,
+ timeout=timeout,
+ )
+
+ async def delete_user(self, workspace, user_id, actor="",
+ timeout=IAM_TIMEOUT):
+ await self._request(
+ operation="delete-user",
+ workspace=workspace,
+ actor=actor,
+ user_id=user_id,
+ timeout=timeout,
+ )
+
+ async def create_workspace(self, workspace_record, actor="",
+ timeout=IAM_TIMEOUT):
+ resp = await self._request(
+ operation="create-workspace",
+ actor=actor,
+ workspace_record=workspace_record,
+ timeout=timeout,
+ )
+ return resp.workspace
+
+ async def list_workspaces(self, actor="", timeout=IAM_TIMEOUT):
+ resp = await self._request(
+ operation="list-workspaces",
+ actor=actor,
+ timeout=timeout,
+ )
+ return list(resp.workspaces)
+
+    async def get_workspace(self, workspace_id, actor="",
+                            timeout=IAM_TIMEOUT):
+        """Look up ``workspace_id`` and return ``resp.workspace``."""
+        resp = await self._request(
+            operation="get-workspace",
+            actor=actor,
+            workspace_record=WorkspaceInput(id=workspace_id),
+            timeout=timeout,
+        )
+        return resp.workspace
+
+ async def update_workspace(self, workspace_record, actor="",
+ timeout=IAM_TIMEOUT):
+ resp = await self._request(
+ operation="update-workspace",
+ actor=actor,
+ workspace_record=workspace_record,
+ timeout=timeout,
+ )
+ return resp.workspace
+
+    async def disable_workspace(self, workspace_id, actor="",
+                                timeout=IAM_TIMEOUT):
+        """Send a ``disable-workspace`` request for ``workspace_id``."""
+        await self._request(
+            operation="disable-workspace",
+            actor=actor,
+            workspace_record=WorkspaceInput(id=workspace_id),
+            timeout=timeout,
+        )
+
+ async def rotate_signing_key(self, actor="", timeout=IAM_TIMEOUT):
+ await self._request(
+ operation="rotate-signing-key",
+ actor=actor,
+ timeout=timeout,
+ )
+
+
+class IamClientSpec(RequestResponseSpec):
+ def __init__(self, request_name, response_name):
+ super().__init__(
+ request_name=request_name,
+ request_schema=IamRequest,
+ response_name=response_name,
+ response_schema=IamResponse,
+ impl=IamClient,
+ )
diff --git a/trustgraph-base/trustgraph/messaging/__init__.py b/trustgraph-base/trustgraph/messaging/__init__.py
index 30f5061c..9fcfa6f7 100644
--- a/trustgraph-base/trustgraph/messaging/__init__.py
+++ b/trustgraph-base/trustgraph/messaging/__init__.py
@@ -15,6 +15,7 @@ from .translators.library import LibraryRequestTranslator, LibraryResponseTransl
from .translators.document_loading import DocumentTranslator, TextDocumentTranslator
from .translators.config import ConfigRequestTranslator, ConfigResponseTranslator
from .translators.flow import FlowRequestTranslator, FlowResponseTranslator
+from .translators.iam import IamRequestTranslator, IamResponseTranslator
from .translators.prompt import PromptRequestTranslator, PromptResponseTranslator
from .translators.tool import ToolRequestTranslator, ToolResponseTranslator
from .translators.embeddings_query import (
@@ -85,11 +86,17 @@ TranslatorRegistry.register_service(
)
TranslatorRegistry.register_service(
- "flow",
- FlowRequestTranslator(),
+ "flow",
+ FlowRequestTranslator(),
FlowResponseTranslator()
)
+TranslatorRegistry.register_service(
+ "iam",
+ IamRequestTranslator(),
+ IamResponseTranslator()
+)
+
TranslatorRegistry.register_service(
"prompt",
PromptRequestTranslator(),
diff --git a/trustgraph-base/trustgraph/messaging/translators/iam.py b/trustgraph-base/trustgraph/messaging/translators/iam.py
new file mode 100644
index 00000000..1d7bf21c
--- /dev/null
+++ b/trustgraph-base/trustgraph/messaging/translators/iam.py
@@ -0,0 +1,198 @@
+from typing import Dict, Any, Tuple
+
+from ...schema import IamRequest, IamResponse
+from ...schema import (
+ UserInput, UserRecord,
+ WorkspaceInput, WorkspaceRecord,
+ ApiKeyInput, ApiKeyRecord,
+)
+from .base import MessageTranslator
+
+
+def _user_input_from_dict(d):
+ if d is None:
+ return None
+ return UserInput(
+ username=d.get("username", ""),
+ name=d.get("name", ""),
+ email=d.get("email", ""),
+ password=d.get("password", ""),
+ roles=list(d.get("roles", [])),
+ enabled=d.get("enabled", True),
+ must_change_password=d.get("must_change_password", False),
+ )
+
+
+def _workspace_input_from_dict(d):
+ if d is None:
+ return None
+ return WorkspaceInput(
+ id=d.get("id", ""),
+ name=d.get("name", ""),
+ enabled=d.get("enabled", True),
+ )
+
+
+def _api_key_input_from_dict(d):
+ if d is None:
+ return None
+ return ApiKeyInput(
+ user_id=d.get("user_id", ""),
+ name=d.get("name", ""),
+ expires=d.get("expires", ""),
+ )
+
+
+def _user_record_to_dict(r):
+ if r is None:
+ return None
+ return {
+ "id": r.id,
+ "workspace": r.workspace,
+ "username": r.username,
+ "name": r.name,
+ "email": r.email,
+ "roles": list(r.roles),
+ "enabled": r.enabled,
+ "must_change_password": r.must_change_password,
+ "created": r.created,
+ }
+
+
+def _workspace_record_to_dict(r):
+ if r is None:
+ return None
+ return {
+ "id": r.id,
+ "name": r.name,
+ "enabled": r.enabled,
+ "created": r.created,
+ }
+
+
+def _api_key_record_to_dict(r):
+ if r is None:
+ return None
+ return {
+ "id": r.id,
+ "user_id": r.user_id,
+ "name": r.name,
+ "prefix": r.prefix,
+ "expires": r.expires,
+ "created": r.created,
+ "last_used": r.last_used,
+ }
+
+
+class IamRequestTranslator(MessageTranslator):
+
+ def decode(self, data: Dict[str, Any]) -> IamRequest:
+ return IamRequest(
+ operation=data.get("operation", ""),
+ workspace=data.get("workspace", ""),
+ actor=data.get("actor", ""),
+ user_id=data.get("user_id", ""),
+ username=data.get("username", ""),
+ key_id=data.get("key_id", ""),
+ api_key=data.get("api_key", ""),
+ password=data.get("password", ""),
+ new_password=data.get("new_password", ""),
+ user=_user_input_from_dict(data.get("user")),
+ workspace_record=_workspace_input_from_dict(
+ data.get("workspace_record")
+ ),
+ key=_api_key_input_from_dict(data.get("key")),
+ )
+
+ def encode(self, obj: IamRequest) -> Dict[str, Any]:
+ result = {"operation": obj.operation}
+ for fname in (
+ "workspace", "actor", "user_id", "username", "key_id",
+ "api_key", "password", "new_password",
+ ):
+ v = getattr(obj, fname, "")
+ if v:
+ result[fname] = v
+ if obj.user is not None:
+ result["user"] = {
+ "username": obj.user.username,
+ "name": obj.user.name,
+ "email": obj.user.email,
+ "password": obj.user.password,
+ "roles": list(obj.user.roles),
+ "enabled": obj.user.enabled,
+ "must_change_password": obj.user.must_change_password,
+ }
+ if obj.workspace_record is not None:
+ result["workspace_record"] = {
+ "id": obj.workspace_record.id,
+ "name": obj.workspace_record.name,
+ "enabled": obj.workspace_record.enabled,
+ }
+ if obj.key is not None:
+ result["key"] = {
+ "user_id": obj.key.user_id,
+ "name": obj.key.name,
+ "expires": obj.key.expires,
+ }
+ return result
+
+
+class IamResponseTranslator(MessageTranslator):
+
+ def decode(self, data: Dict[str, Any]) -> IamResponse:
+ raise NotImplementedError(
+ "IamResponse is a server-produced message; no HTTP→schema "
+ "path is needed"
+ )
+
+ def encode(self, obj: IamResponse) -> Dict[str, Any]:
+ result: Dict[str, Any] = {}
+
+ if obj.user is not None:
+ result["user"] = _user_record_to_dict(obj.user)
+ if obj.users:
+ result["users"] = [_user_record_to_dict(u) for u in obj.users]
+ if obj.workspace is not None:
+ result["workspace"] = _workspace_record_to_dict(obj.workspace)
+ if obj.workspaces:
+ result["workspaces"] = [
+ _workspace_record_to_dict(w) for w in obj.workspaces
+ ]
+ if obj.api_key_plaintext:
+ result["api_key_plaintext"] = obj.api_key_plaintext
+ if obj.api_key is not None:
+ result["api_key"] = _api_key_record_to_dict(obj.api_key)
+ if obj.api_keys:
+ result["api_keys"] = [
+ _api_key_record_to_dict(k) for k in obj.api_keys
+ ]
+ if obj.jwt:
+ result["jwt"] = obj.jwt
+ if obj.jwt_expires:
+ result["jwt_expires"] = obj.jwt_expires
+ if obj.signing_key_public:
+ result["signing_key_public"] = obj.signing_key_public
+ if obj.resolved_user_id:
+ result["resolved_user_id"] = obj.resolved_user_id
+ if obj.resolved_workspace:
+ result["resolved_workspace"] = obj.resolved_workspace
+ if obj.resolved_roles:
+ result["resolved_roles"] = list(obj.resolved_roles)
+ if obj.temporary_password:
+ result["temporary_password"] = obj.temporary_password
+ if obj.bootstrap_admin_user_id:
+ result["bootstrap_admin_user_id"] = obj.bootstrap_admin_user_id
+ if obj.bootstrap_admin_api_key:
+ result["bootstrap_admin_api_key"] = obj.bootstrap_admin_api_key
+ # bootstrap-status: emit unconditionally — the false case is
+ # meaningful for UIs deciding whether to render first-run
+ # setup, so it can't be dropped by a truthy-only filter.
+ result["bootstrap_available"] = bool(obj.bootstrap_available)
+
+ return result
+
+ def encode_with_completion(
+ self, obj: IamResponse,
+ ) -> Tuple[Dict[str, Any], bool]:
+ return self.encode(obj), True
diff --git a/trustgraph-base/trustgraph/schema/services/__init__.py b/trustgraph-base/trustgraph/schema/services/__init__.py
index 550b7d12..2a214201 100644
--- a/trustgraph-base/trustgraph/schema/services/__init__.py
+++ b/trustgraph-base/trustgraph/schema/services/__init__.py
@@ -5,6 +5,7 @@ from .agent import *
from .flow import *
from .prompt import *
from .config import *
+from .iam import *
from .library import *
from .lookup import *
from .nlp_query import *
diff --git a/trustgraph-base/trustgraph/schema/services/iam.py b/trustgraph-base/trustgraph/schema/services/iam.py
new file mode 100644
index 00000000..797d6203
--- /dev/null
+++ b/trustgraph-base/trustgraph/schema/services/iam.py
@@ -0,0 +1,173 @@
+
+from dataclasses import dataclass, field
+
+from ..core.topic import queue
+from ..core.primitives import Error
+
+############################################################################
+
+# IAM service — see docs/tech-specs/iam-protocol.md for the full protocol.
+#
+# Transport: request/response pub/sub, correlated by the `id` message
+# property. Caller is the API gateway only; the IAM service trusts
+# the bus per the enforcement-boundary policy (no per-request auth
+# against the caller).
+
+
+@dataclass
+class UserInput:
+ username: str = ""
+ name: str = ""
+ email: str = ""
+ # Only populated on create-user; never on update-user.
+ password: str = ""
+ roles: list[str] = field(default_factory=list)
+ enabled: bool = True
+ must_change_password: bool = False
+
+
+@dataclass
+class UserRecord:
+ id: str = ""
+ workspace: str = ""
+ username: str = ""
+ name: str = ""
+ email: str = ""
+ roles: list[str] = field(default_factory=list)
+ enabled: bool = True
+ must_change_password: bool = False
+ created: str = ""
+
+
+@dataclass
+class WorkspaceInput:
+ id: str = ""
+ name: str = ""
+ enabled: bool = True
+
+
+@dataclass
+class WorkspaceRecord:
+ id: str = ""
+ name: str = ""
+ enabled: bool = True
+ created: str = ""
+
+
+@dataclass
+class ApiKeyInput:
+ user_id: str = ""
+ name: str = ""
+ expires: str = ""
+
+
+@dataclass
+class ApiKeyRecord:
+ id: str = ""
+ user_id: str = ""
+ name: str = ""
+ # First 4 chars of the plaintext token, for operator identification
+ # in list-api-keys. Never enough to reconstruct the key.
+ prefix: str = ""
+ expires: str = ""
+ created: str = ""
+ last_used: str = ""
+
+
+@dataclass
+class IamRequest:
+ operation: str = ""
+
+ # Workspace scope. Required on workspace-scoped operations;
+ # omitted for system-level ops (workspace CRUD, signing-key
+ # ops, bootstrap, resolve-api-key, login).
+ workspace: str = ""
+
+ # Acting user id for audit. Empty for internal-origin and for
+ # operations that resolve an identity (login, resolve-api-key).
+ actor: str = ""
+
+ user_id: str = ""
+ username: str = ""
+ key_id: str = ""
+ api_key: str = ""
+
+ password: str = ""
+ new_password: str = ""
+
+ user: UserInput | None = None
+ workspace_record: WorkspaceInput | None = None
+ key: ApiKeyInput | None = None
+
+ # ---- authorise / authorise-many inputs ----
+ # Capability string from the vocabulary in capabilities.md.
+ capability: str = ""
+ # Resource identifier as JSON. See the IAM contract spec for
+ # the resource-component vocabulary. An empty dict denotes a
+ # system-level resource.
+ resource_json: str = ""
+ # Operation parameters as JSON. Decision-relevant fields the
+ # operation supplied that are not part of the resource address
+ # (e.g. workspace association on create-user).
+ parameters_json: str = ""
+ # For authorise-many: a JSON-serialised list of
+ # {"capability": str, "resource": dict, "parameters": dict}.
+ authorise_checks: str = ""
+
+
+@dataclass
+class IamResponse:
+ user: UserRecord | None = None
+ users: list[UserRecord] = field(default_factory=list)
+
+ workspace: WorkspaceRecord | None = None
+ workspaces: list[WorkspaceRecord] = field(default_factory=list)
+
+ # create-api-key returns the plaintext once; never populated
+ # on any other operation.
+ api_key_plaintext: str = ""
+ api_key: ApiKeyRecord | None = None
+ api_keys: list[ApiKeyRecord] = field(default_factory=list)
+
+ # login, rotate-signing-key
+ jwt: str = ""
+ jwt_expires: str = ""
+
+ # get-signing-key-public
+ signing_key_public: str = ""
+
+ # resolve-api-key
+ resolved_user_id: str = ""
+ resolved_workspace: str = ""
+ resolved_roles: list[str] = field(default_factory=list)
+
+ # reset-password
+ temporary_password: str = ""
+
+ # bootstrap
+ bootstrap_admin_user_id: str = ""
+ bootstrap_admin_api_key: str = ""
+
+ # bootstrap-status — true iff iam-svc is in 'bootstrap' mode with
+ # empty tables, i.e. an unconsumed bootstrap call would succeed.
+ bootstrap_available: bool = False
+
+ # ---- authorise / authorise-many outputs ----
+ # authorise: the regime's allow / deny verdict.
+ decision_allow: bool = False
+ # Cache TTL the regime suggests, in seconds. Gateway respects
+ # this for both allow and deny decisions; bounded above by
+ # gateway-side policy (typically <= 60s).
+ decision_ttl_seconds: int = 0
+ # authorise-many: a JSON-serialised list of {"allow": bool,
+ # "ttl": int} in the same order as the request's
+ # authorise_checks.
+ decisions_json: str = ""
+
+ error: Error | None = None
+
+
+iam_request_queue = queue('iam', cls='request')
+iam_response_queue = queue('iam', cls='response')
+
+############################################################################
diff --git a/trustgraph-cli/pyproject.toml b/trustgraph-cli/pyproject.toml
index a5738449..e8062fba 100644
--- a/trustgraph-cli/pyproject.toml
+++ b/trustgraph-cli/pyproject.toml
@@ -40,7 +40,22 @@ tg-get-flow-blueprint = "trustgraph.cli.get_flow_blueprint:main"
tg-get-kg-core = "trustgraph.cli.get_kg_core:main"
tg-get-document-content = "trustgraph.cli.get_document_content:main"
tg-graph-to-turtle = "trustgraph.cli.graph_to_turtle:main"
-tg-init-trustgraph = "trustgraph.cli.init_trustgraph:main"
+tg-bootstrap-iam = "trustgraph.cli.bootstrap_iam:main"
+tg-login = "trustgraph.cli.login:main"
+tg-create-user = "trustgraph.cli.create_user:main"
+tg-list-users = "trustgraph.cli.list_users:main"
+tg-whoami = "trustgraph.cli.whoami:main"
+tg-update-user = "trustgraph.cli.update_user:main"
+tg-disable-user = "trustgraph.cli.disable_user:main"
+tg-enable-user = "trustgraph.cli.enable_user:main"
+tg-delete-user = "trustgraph.cli.delete_user:main"
+tg-change-password = "trustgraph.cli.change_password:main"
+tg-reset-password = "trustgraph.cli.reset_password:main"
+tg-create-api-key = "trustgraph.cli.create_api_key:main"
+tg-list-api-keys = "trustgraph.cli.list_api_keys:main"
+tg-revoke-api-key = "trustgraph.cli.revoke_api_key:main"
+tg-list-workspaces = "trustgraph.cli.list_workspaces:main"
+tg-create-workspace = "trustgraph.cli.create_workspace:main"
tg-invoke-agent = "trustgraph.cli.invoke_agent:main"
tg-invoke-document-rag = "trustgraph.cli.invoke_document_rag:main"
tg-invoke-graph-rag = "trustgraph.cli.invoke_graph_rag:main"
diff --git a/trustgraph-cli/trustgraph/cli/_iam.py b/trustgraph-cli/trustgraph/cli/_iam.py
new file mode 100644
index 00000000..f5278c0c
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/_iam.py
@@ -0,0 +1,75 @@
+"""
+Shared helpers for IAM CLI tools.
+
+All IAM operations go through the gateway's ``/api/v1/iam`` forwarder,
+with the three public auth operations (``login``, ``bootstrap``,
+``change-password``) served via ``/api/v1/auth/...`` instead. These
+helpers encapsulate the HTTP plumbing so each CLI can stay focused
+on its own argument parsing and output formatting.
+"""
+
+import json
+import os
+import sys
+
+import requests
+
+
+DEFAULT_URL = os.getenv("TRUSTGRAPH_URL", "http://localhost:8088/")
+DEFAULT_TOKEN = os.getenv("TRUSTGRAPH_TOKEN", None)
+
+
+def _fmt_error(resp_json):
+ err = resp_json.get("error", {})
+ if isinstance(err, dict):
+ t = err.get("type", "")
+ m = err.get("message", "")
+ return f"{t}: {m}" if t else m or "error"
+ return str(err)
+
+
+def _post(url, path, token, body):
+    """POST ``body`` as JSON to ``url`` + ``path`` and return the decoded
+    reply; raise ``RuntimeError`` on a non-200 status or an ``error`` key."""
+    endpoint = url.rstrip("/") + path
+    headers = {"Content-Type": "application/json"}
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+    # requests waits forever by default; bound it so a stalled gateway
+    # surfaces as an exception instead of a hung CLI.
+    resp = requests.post(
+        endpoint, headers=headers, data=json.dumps(body), timeout=60,
+    )
+    if resp.status_code != 200:
+        try:
+            detail = _fmt_error(resp.json())
+        except Exception:
+            # Reply was not JSON or not the error shape: show it raw.
+            detail = resp.text
+        raise RuntimeError(f"HTTP {resp.status_code}: {detail}")
+    reply = resp.json()
+    if "error" in reply:
+        raise RuntimeError(_fmt_error(reply))
+    return reply
+
+
+def call_iam(url, token, request):
+ """Forward an IAM request through ``/api/v1/iam``. ``request`` is
+ the ``IamRequest`` dict shape."""
+ return _post(url, "/api/v1/iam", token, request)
+
+
+def call_auth(url, path, token, body):
+ """Hit one of the public auth endpoints
+ (``/api/v1/auth/login``, ``/api/v1/auth/change-password``, etc.).
+ ``token`` is optional — login and bootstrap don't need one."""
+ return _post(url, path, token, body)
+
+
+def run_main(fn, parser):
+ """Standard error-handling wrapper for CLI main() bodies."""
+ args = parser.parse_args()
+ try:
+ fn(args)
+ except Exception as e:
+ print("Exception:", e, file=sys.stderr, flush=True)
+ sys.exit(1)
diff --git a/trustgraph-cli/trustgraph/cli/bootstrap_iam.py b/trustgraph-cli/trustgraph/cli/bootstrap_iam.py
new file mode 100644
index 00000000..99a789e2
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/bootstrap_iam.py
@@ -0,0 +1,94 @@
+"""
+Bootstraps the IAM service. Only works when iam-svc is running in
+bootstrap mode with empty tables. Prints the initial admin API key
+to stdout.
+
+This is a one-time, trust-sensitive operation. The resulting token
+is shown once and never again — capture it on use. Rotate and
+revoke it as soon as a real admin API key has been issued.
+"""
+
+import argparse
+import json
+import os
+import sys
+
+import requests
+
+default_url = os.getenv("TRUSTGRAPH_URL", "http://localhost:8088/")
+
+
+def bootstrap(url):
+    """POST to the bootstrap endpoint and return ``(user_id, api_key)``;
+    raise ``RuntimeError`` on failure or when no API key is returned."""
+    # Unauthenticated public endpoint — IAM refuses the bootstrap
+    # operation unless the service is running in bootstrap mode with
+    # empty tables, so the safety gate lives on the server side.
+    endpoint = url.rstrip("/") + "/api/v1/auth/bootstrap"
+    headers = {"Content-Type": "application/json"}
+
+    resp = requests.post(
+        endpoint,
+        headers=headers,
+        data=json.dumps({}),
+        timeout=60,  # bounded wait so a stalled gateway cannot hang us
+    )
+
+    if resp.status_code != 200:
+        raise RuntimeError(f"HTTP {resp.status_code}: {resp.text}")
+
+    body = resp.json()
+    if "error" in body:
+        # Error may be a {type, message} object or a bare value.
+        err = body["error"]
+        if not isinstance(err, dict):
+            err = {"message": str(err)}
+        raise RuntimeError(
+            f"IAM {err.get('type', 'error')}: {err.get('message', '')}"
+        )
+
+    api_key = body.get("bootstrap_admin_api_key")
+    user_id = body.get("bootstrap_admin_user_id")
+    if not api_key:
+        raise RuntimeError(
+            "IAM response did not contain a bootstrap token — the "
+            "service may already be bootstrapped, or may be running "
+            "in token mode."
+        )
+
+    return user_id, api_key
+
+
+def main():
+
+ parser = argparse.ArgumentParser(
+ prog="tg-bootstrap-iam",
+ description=__doc__,
+ )
+
+ parser.add_argument(
+ "-u", "--api-url",
+ default=default_url,
+ help=f"API URL (default: {default_url})",
+ )
+
+ args = parser.parse_args()
+
+ try:
+ user_id, api_key = bootstrap(args.api_url)
+ except Exception as e:
+ print("Exception:", e, file=sys.stderr, flush=True)
+ sys.exit(1)
+
+ # Stdout gets machine-readable output (the key). Any operator
+ # context goes to stderr.
+ print(f"Admin user id: {user_id}", file=sys.stderr)
+ print(
+ "Admin API key (shown once, capture now):",
+ file=sys.stderr,
+ )
+ print(api_key)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/change_password.py b/trustgraph-cli/trustgraph/cli/change_password.py
new file mode 100644
index 00000000..c914b30f
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/change_password.py
@@ -0,0 +1,46 @@
+"""
+Change your own password. Requires the current password.
+"""
+
+import argparse
+import getpass
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_auth, run_main
+
+
+def do_change_password(args):
+ current = args.current or getpass.getpass("Current password: ")
+ new = args.new or getpass.getpass("New password: ")
+
+ call_auth(
+ args.api_url, "/api/v1/auth/change-password", args.token,
+ {"current_password": current, "new_password": new},
+ )
+ print("Password changed.")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ prog="tg-change-password", description=__doc__,
+ )
+ parser.add_argument(
+ "-u", "--api-url", default=DEFAULT_URL,
+ help=f"API URL (default: {DEFAULT_URL})",
+ )
+ parser.add_argument(
+ "-t", "--token", default=DEFAULT_TOKEN,
+ help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+ )
+ parser.add_argument(
+ "--current", default=None,
+ help="Current password (prompted if omitted)",
+ )
+ parser.add_argument(
+ "--new", default=None,
+ help="New password (prompted if omitted)",
+ )
+ run_main(do_change_password, parser)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/create_api_key.py b/trustgraph-cli/trustgraph/cli/create_api_key.py
new file mode 100644
index 00000000..2b269041
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/create_api_key.py
@@ -0,0 +1,71 @@
+"""
+Create an API key for a user. Prints the plaintext key to stdout —
+shown once only.
+"""
+
+import argparse
+import sys
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def do_create_api_key(args):
+    """Create an API key for ``args.user_id`` and print the plaintext key.
+
+    Sends a ``create-api-key`` IAM request. ``expires`` and
+    ``workspace`` are only included in the request when supplied.
+    Key metadata (id, name, prefix) is written to stderr; the plaintext
+    key alone is written to stdout so it can be captured.
+
+    Raises:
+        RuntimeError: if the response carries no plaintext key.
+    """
+    key = {
+        "user_id": args.user_id,
+        "name": args.name,
+    }
+    if args.expires:
+        key["expires"] = args.expires
+
+    req = {"operation": "create-api-key", "key": key}
+    if args.workspace:
+        req["workspace"] = args.workspace
+    resp = call_iam(args.api_url, args.token, req)
+
+    plaintext = resp.get("api_key_plaintext", "")
+    if not plaintext:
+        # The key is shown once only; an empty line on stdout would be
+        # indistinguishable from a key to anything capturing the output.
+        raise RuntimeError(
+            "IAM returned no API key plaintext — unexpected"
+        )
+
+    rec = resp.get("api_key", {})
+    print(f"Key id: {rec.get('id', '')}", file=sys.stderr)
+    print(f"Name: {rec.get('name', '')}", file=sys.stderr)
+    print(f"Prefix: {rec.get('prefix', '')}", file=sys.stderr)
+    print(
+        "API key (shown once, capture now):", file=sys.stderr,
+    )
+    # Machine-readable output: the key and nothing else.
+    print(plaintext)
+
+
+def main():
+    """Declare the tg-create-api-key options and hand off to run_main."""
+    workspace_help = (
+        "Target workspace (admin only; defaults to caller's "
+        "assigned workspace)"
+    )
+
+    cli = argparse.ArgumentParser(
+        prog="tg-create-api-key",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    cli.add_argument("--user-id", required=True, help="Owner user id")
+    cli.add_argument(
+        "--name",
+        required=True,
+        help="Operator-facing label (e.g. 'laptop', 'ci')",
+    )
+    cli.add_argument(
+        "--expires",
+        default=None,
+        help="ISO-8601 expiry (optional; empty = no expiry)",
+    )
+    cli.add_argument("-w", "--workspace", default=None, help=workspace_help)
+    run_main(do_create_api_key, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/create_user.py b/trustgraph-cli/trustgraph/cli/create_user.py
new file mode 100644
index 00000000..c9253aca
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/create_user.py
@@ -0,0 +1,87 @@
+"""
+Create a user in the caller's workspace, or in the workspace given with -w. Prints the new user id.
+"""
+
+import argparse
+import getpass
+import sys
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def do_create_user(args):
+    """Create a user via IAM and print the new user id to stdout.
+
+    The password is taken from ``args.password`` or, when that is empty,
+    read with ``getpass``. ``name``, ``email``, ``must_change_password``
+    and ``workspace`` are only sent when set. A short summary goes to
+    stderr; stdout carries just the id.
+    """
+    password = args.password or getpass.getpass(
+        f"Password for new user {args.username}: "
+    )
+
+    user = {
+        "username": args.username,
+        "password": password,
+        "roles": args.roles,
+    }
+    # Optional profile fields, added in the same order as before.
+    for field in ("name", "email"):
+        value = getattr(args, field)
+        if value:
+            user[field] = value
+    if args.must_change_password:
+        user["must_change_password"] = True
+
+    req = {"operation": "create-user", "user": user}
+    if args.workspace:
+        req["workspace"] = args.workspace
+
+    created = call_iam(args.api_url, args.token, req).get("user", {})
+    user_id = created.get("id", "")
+
+    print(f"User id: {user_id}", file=sys.stderr)
+    print(f"Username: {created.get('username', '')}", file=sys.stderr)
+    print(f"Roles: {', '.join(created.get('roles', []))}", file=sys.stderr)
+    print(user_id)
+
+
+def main():
+    """Declare the tg-create-user options and hand off to run_main."""
+    workspace_help = (
+        "Target workspace (admin only; defaults to caller's "
+        "assigned workspace)"
+    )
+
+    cli = argparse.ArgumentParser(
+        prog="tg-create-user",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    cli.add_argument(
+        "--username",
+        required=True,
+        help="Username (unique in workspace)",
+    )
+    cli.add_argument(
+        "--password",
+        default=None,
+        help="Password (prompted if omitted)",
+    )
+    cli.add_argument("--name", default=None, help="Display name")
+    cli.add_argument("--email", default=None, help="Email")
+    cli.add_argument(
+        "--roles",
+        nargs="+",
+        default=["reader"],
+        help="One or more role names (default: reader)",
+    )
+    cli.add_argument(
+        "--must-change-password",
+        action="store_true",
+        help="Force password change on next login",
+    )
+    cli.add_argument("-w", "--workspace", default=None, help=workspace_help)
+    run_main(do_create_user, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/create_workspace.py b/trustgraph-cli/trustgraph/cli/create_workspace.py
new file mode 100644
index 00000000..f8367720
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/create_workspace.py
@@ -0,0 +1,46 @@
+"""
+Create a workspace (system-level; requires admin).
+"""
+
+import argparse
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def do_create_workspace(args):
+    """Create an enabled workspace via IAM and report the id returned.
+
+    The display name is only included in the record when ``args.name``
+    is set.
+    """
+    record = {"id": args.workspace_id, "enabled": True}
+    if args.name:
+        record["name"] = args.name
+
+    request = {
+        "operation": "create-workspace",
+        "workspace_record": record,
+    }
+    response = call_iam(args.api_url, args.token, request)
+    created = response.get("workspace", {})
+    print(f"Workspace created: {created.get('id', '')}")
+
+
+def main():
+    """Declare the tg-create-workspace options and hand off to run_main."""
+    cli = argparse.ArgumentParser(
+        prog="tg-create-workspace",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    cli.add_argument(
+        "--workspace-id",
+        required=True,
+        help="New workspace id (must not start with '_')",
+    )
+    cli.add_argument("--name", default=None, help="Display name")
+    run_main(do_create_workspace, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/delete_user.py b/trustgraph-cli/trustgraph/cli/delete_user.py
new file mode 100644
index 00000000..dbdf7877
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/delete_user.py
@@ -0,0 +1,62 @@
+"""
+Delete a user. Removes the user record, their username lookup,
+and all their API keys. The freed username becomes available for
+re-use.
+
+Irreversible. Use tg-disable-user if you want to preserve the
+record (audit trail, username squatting protection).
+"""
+
+import argparse
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def do_delete_user(args):
+    """Delete a user via IAM, asking for confirmation unless ``--yes``.
+
+    Without ``args.yes`` the operator must type exactly ``yes``
+    (surrounding whitespace ignored); any other answer prints
+    ``Aborted.`` and returns without contacting IAM.
+    """
+    if not args.yes:
+        prompt = (
+            f"Delete user {args.user_id}? This is irreversible. "
+            f"[type 'yes' to confirm]: "
+        )
+        answer = input(prompt)
+        if answer.strip() != "yes":
+            print("Aborted.")
+            return
+
+    request = {"operation": "delete-user", "user_id": args.user_id}
+    if args.workspace:
+        request.update(workspace=args.workspace)
+    call_iam(args.api_url, args.token, request)
+    print(f"Deleted user {args.user_id}")
+
+
+def main():
+    """Declare the tg-delete-user options and hand off to run_main."""
+    workspace_help = (
+        "Target workspace (admin only; defaults to caller's "
+        "assigned workspace)"
+    )
+
+    cli = argparse.ArgumentParser(
+        prog="tg-delete-user",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    cli.add_argument("--user-id", required=True, help="User id to delete")
+    cli.add_argument("-w", "--workspace", default=None, help=workspace_help)
+    cli.add_argument(
+        "--yes",
+        action="store_true",
+        help="Skip the interactive confirmation prompt",
+    )
+    run_main(do_delete_user, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/disable_user.py b/trustgraph-cli/trustgraph/cli/disable_user.py
new file mode 100644
index 00000000..e142644b
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/disable_user.py
@@ -0,0 +1,45 @@
+"""
+Disable a user. Soft-deletes (enabled=false) and revokes all their
+API keys.
+"""
+
+import argparse
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def do_disable_user(args):
+    """Send a ``disable-user`` IAM request for ``args.user_id``.
+
+    The workspace is only included when ``args.workspace`` is set.
+    Prints a confirmation after ``call_iam`` returns.
+    """
+    request = {"operation": "disable-user", "user_id": args.user_id}
+    if args.workspace:
+        request.update(workspace=args.workspace)
+    call_iam(args.api_url, args.token, request)
+    print(f"Disabled user {args.user_id}")
+
+
+def main():
+    """Declare the tg-disable-user options and hand off to run_main."""
+    workspace_help = (
+        "Target workspace (admin only; defaults to caller's "
+        "assigned workspace)"
+    )
+
+    cli = argparse.ArgumentParser(
+        prog="tg-disable-user",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    cli.add_argument("--user-id", required=True, help="User id to disable")
+    cli.add_argument("-w", "--workspace", default=None, help=workspace_help)
+    run_main(do_disable_user, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/enable_user.py b/trustgraph-cli/trustgraph/cli/enable_user.py
new file mode 100644
index 00000000..c762366a
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/enable_user.py
@@ -0,0 +1,45 @@
+"""
+Re-enable a previously disabled user. Does not restore their API
+keys — those must be re-issued by an admin.
+"""
+
+import argparse
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def do_enable_user(args):
+    """Send an ``enable-user`` IAM request for ``args.user_id``.
+
+    The workspace is only included when ``args.workspace`` is set.
+    Prints a confirmation after ``call_iam`` returns.
+    """
+    request = {"operation": "enable-user", "user_id": args.user_id}
+    if args.workspace:
+        request.update(workspace=args.workspace)
+    call_iam(args.api_url, args.token, request)
+    print(f"Enabled user {args.user_id}")
+
+
+def main():
+    """Declare the tg-enable-user options and hand off to run_main."""
+    workspace_help = (
+        "Target workspace (admin only; defaults to caller's "
+        "assigned workspace)"
+    )
+
+    cli = argparse.ArgumentParser(
+        prog="tg-enable-user",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    cli.add_argument("--user-id", required=True, help="User id to enable")
+    cli.add_argument("-w", "--workspace", default=None, help=workspace_help)
+    run_main(do_enable_user, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/init_trustgraph.py b/trustgraph-cli/trustgraph/cli/init_trustgraph.py
deleted file mode 100644
index d984f925..00000000
--- a/trustgraph-cli/trustgraph/cli/init_trustgraph.py
+++ /dev/null
@@ -1,271 +0,0 @@
-"""
-Initialises TrustGraph pub/sub infrastructure and pushes initial config.
-
-For Pulsar: creates tenant, namespaces, and retention policies.
-For RabbitMQ: queues are auto-declared, so only config push is needed.
-"""
-
-import requests
-import time
-import argparse
-import json
-
-from trustgraph.clients.config_client import ConfigClient
-from trustgraph.base.pubsub import add_pubsub_args
-
-default_pulsar_admin_url = "http://pulsar:8080"
-subscriber = "tg-init-pubsub"
-
-
-def get_clusters(url):
-
- print("Get clusters...", flush=True)
-
- resp = requests.get(f"{url}/admin/v2/clusters")
-
- if resp.status_code != 200: raise RuntimeError("Could not fetch clusters")
-
- return resp.json()
-
-def ensure_tenant(url, tenant, clusters):
-
- resp = requests.get(f"{url}/admin/v2/tenants/{tenant}")
-
- if resp.status_code == 200:
- print(f"Tenant {tenant} already exists.", flush=True)
- return
-
- resp = requests.put(
- f"{url}/admin/v2/tenants/{tenant}",
- json={
- "adminRoles": [],
- "allowedClusters": clusters,
- }
- )
-
- if resp.status_code != 204:
- print(resp.text, flush=True)
- raise RuntimeError("Tenant creation failed.")
-
- print(f"Tenant {tenant} created.", flush=True)
-
-def ensure_namespace(url, tenant, namespace, config):
-
- resp = requests.get(f"{url}/admin/v2/namespaces/{tenant}/{namespace}")
-
- if resp.status_code == 200:
- print(f"Namespace {tenant}/{namespace} already exists.", flush=True)
- return
-
- resp = requests.put(
- f"{url}/admin/v2/namespaces/{tenant}/{namespace}",
- json=config,
- )
-
- if resp.status_code != 204:
- print(resp.status_code, flush=True)
- print(resp.text, flush=True)
- raise RuntimeError(f"Namespace {tenant}/{namespace} creation failed.")
-
- print(f"Namespace {tenant}/{namespace} created.", flush=True)
-
-def ensure_config(config, workspace="default", **pubsub_config):
-
- cli = ConfigClient(
- subscriber=subscriber,
- workspace=workspace,
- **pubsub_config,
- )
-
- while True:
-
- try:
-
- print("Get current config...", flush=True)
- current, version = cli.config(timeout=5)
-
- except Exception as e:
-
- print("Exception:", e, flush=True)
- time.sleep(2)
- print("Retrying...", flush=True)
- continue
-
- print("Current config version is", version, flush=True)
-
- if version != 0:
- print("Already updated, not updating config. Done.", flush=True)
- return
-
- print("Config is version 0, updating...", flush=True)
-
- batch = []
-
- for type in config:
- for key in config[type]:
- print(f"Adding {type}/{key} to update.", flush=True)
- batch.append({
- "type": type,
- "key": key,
- "value": json.dumps(config[type][key]),
- })
-
- try:
- cli.put(batch, timeout=10)
- print("Update succeeded.", flush=True)
- break
- except Exception as e:
- print("Exception:", e, flush=True)
- time.sleep(2)
- print("Retrying...", flush=True)
- continue
-
-def init_pulsar(pulsar_admin_url, tenant):
- """Pulsar-specific setup: create tenant, namespaces, retention policies."""
-
- clusters = get_clusters(pulsar_admin_url)
-
- ensure_tenant(pulsar_admin_url, tenant, clusters)
-
- ensure_namespace(pulsar_admin_url, tenant, "flow", {})
-
- ensure_namespace(pulsar_admin_url, tenant, "request", {})
-
- ensure_namespace(pulsar_admin_url, tenant, "response", {
- "retention_policies": {
- "retentionSizeInMB": -1,
- "retentionTimeInMinutes": 3,
- "subscriptionExpirationTimeMinutes": 30,
- }
- })
-
- ensure_namespace(pulsar_admin_url, tenant, "notify", {
- "retention_policies": {
- "retentionSizeInMB": -1,
- "retentionTimeInMinutes": 3,
- "subscriptionExpirationTimeMinutes": 5,
- }
- })
-
-
-def push_config(config_json, config_file, workspace="default",
- **pubsub_config):
- """Push initial config if provided."""
-
- if config_json is not None:
-
- try:
- print("Decoding config...", flush=True)
- dec = json.loads(config_json)
- print("Decoded.", flush=True)
- except Exception as e:
- print("Exception:", e, flush=True)
- raise e
-
- ensure_config(dec, workspace=workspace, **pubsub_config)
-
- elif config_file is not None:
-
- try:
- print("Decoding config...", flush=True)
- dec = json.load(open(config_file))
- print("Decoded.", flush=True)
- except Exception as e:
- print("Exception:", e, flush=True)
- raise e
-
- ensure_config(dec, workspace=workspace, **pubsub_config)
-
- else:
- print("No config to update.", flush=True)
-
-
-def main():
-
- parser = argparse.ArgumentParser(
- prog='tg-init-trustgraph',
- description=__doc__,
- )
-
- parser.add_argument(
- '--pulsar-admin-url',
- default=default_pulsar_admin_url,
- help=f'Pulsar admin URL (default: {default_pulsar_admin_url})',
- )
-
- parser.add_argument(
- '-c', '--config',
- help=f'Initial configuration to load',
- )
-
- parser.add_argument(
- '-C', '--config-file',
- help=f'Initial configuration to load from file',
- )
-
- parser.add_argument(
- '-t', '--tenant',
- default="tg",
- help=f'Tenant (default: tg)',
- )
-
- parser.add_argument(
- '-w', '--workspace',
- default="default",
- help=f'Workspace (default: default)',
- )
-
- add_pubsub_args(parser)
-
- args = parser.parse_args()
-
- backend_type = args.pubsub_backend
-
- # Extract pubsub config from args
- pubsub_config = {
- k: v for k, v in vars(args).items()
- if k not in (
- 'pulsar_admin_url', 'config', 'config_file', 'tenant',
- 'workspace',
- )
- }
-
- while True:
-
- try:
-
- # Pulsar-specific setup (tenants, namespaces)
- if backend_type == 'pulsar':
- print(flush=True)
- print(
- f"Initialising Pulsar at {args.pulsar_admin_url}...",
- flush=True,
- )
- init_pulsar(args.pulsar_admin_url, args.tenant)
- else:
- print(flush=True)
- print(
- f"Using {backend_type} backend (no admin setup needed).",
- flush=True,
- )
-
- # Push config (works with any backend)
- push_config(
- args.config, args.config_file,
- workspace=args.workspace,
- **pubsub_config,
- )
-
- print("Initialisation complete.", flush=True)
- break
-
- except Exception as e:
-
- print("Exception:", e, flush=True)
-
- print("Sleeping...", flush=True)
- time.sleep(2)
- print("Will retry...", flush=True)
-
-if __name__ == "__main__":
- main()
diff --git a/trustgraph-cli/trustgraph/cli/list_api_keys.py b/trustgraph-cli/trustgraph/cli/list_api_keys.py
new file mode 100644
index 00000000..f969890e
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/list_api_keys.py
@@ -0,0 +1,69 @@
+"""
+List the API keys for a user.
+"""
+
+import argparse
+
+import tabulate
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def do_list_api_keys(args):
+    """Fetch a user's API keys from IAM and print them as a table.
+
+    Prints ``No keys.`` when the response has no ``api_keys``. A falsy
+    ``last_used`` is shown as a dash and a falsy ``expires`` as
+    ``never``.
+    """
+    request = {"operation": "list-api-keys", "user_id": args.user_id}
+    if args.workspace:
+        request["workspace"] = args.workspace
+
+    keys = call_iam(args.api_url, args.token, request).get("api_keys", [])
+    if not keys:
+        print("No keys.")
+        return
+
+    headers = ["id", "name", "prefix", "created", "last used", "expires"]
+    table = []
+    for key in keys:
+        table.append([
+            key.get("id", ""),
+            key.get("name", ""),
+            key.get("prefix", ""),
+            key.get("created", ""),
+            key.get("last_used", "") or "—",
+            key.get("expires", "") or "never",
+        ])
+
+    rendered = tabulate.tabulate(
+        table, headers=headers, tablefmt="pretty", stralign="left",
+    )
+    print(rendered)
+
+
+def main():
+    """Declare the tg-list-api-keys options and hand off to run_main."""
+    workspace_help = (
+        "Target workspace (admin only; defaults to caller's "
+        "assigned workspace)"
+    )
+
+    cli = argparse.ArgumentParser(
+        prog="tg-list-api-keys",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    cli.add_argument("--user-id", required=True, help="Owner user id")
+    cli.add_argument("-w", "--workspace", default=None, help=workspace_help)
+    run_main(do_list_api_keys, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/list_users.py b/trustgraph-cli/trustgraph/cli/list_users.py
new file mode 100644
index 00000000..25bc1901
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/list_users.py
@@ -0,0 +1,65 @@
+"""
+List users in the caller's workspace, or in the workspace given with -w.
+"""
+
+import argparse
+
+import tabulate
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def do_list_users(args):
+    """Fetch the user list from IAM and print it as a table.
+
+    Prints ``No users.`` when the response has no ``users``. The
+    ``enabled`` and ``must_change_password`` flags are rendered as
+    yes/no.
+    """
+    def yes_no(flag):
+        return "yes" if flag else "no"
+
+    request = {"operation": "list-users"}
+    if args.workspace:
+        request["workspace"] = args.workspace
+
+    users = call_iam(args.api_url, args.token, request).get("users", [])
+    if not users:
+        print("No users.")
+        return
+
+    headers = ["id", "username", "name", "roles", "enabled", "change-pw"]
+    table = []
+    for user in users:
+        table.append([
+            user.get("id", ""),
+            user.get("username", ""),
+            user.get("name", ""),
+            ", ".join(user.get("roles", [])),
+            yes_no(user.get("enabled")),
+            yes_no(user.get("must_change_password")),
+        ])
+
+    rendered = tabulate.tabulate(
+        table, headers=headers, tablefmt="pretty", stralign="left",
+    )
+    print(rendered)
+
+
+def main():
+    """Declare the tg-list-users options and hand off to run_main."""
+    workspace_help = (
+        "Target workspace (admin only; defaults to caller's "
+        "assigned workspace)"
+    )
+
+    cli = argparse.ArgumentParser(
+        prog="tg-list-users",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    cli.add_argument("-w", "--workspace", default=None, help=workspace_help)
+    run_main(do_list_users, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/list_workspaces.py b/trustgraph-cli/trustgraph/cli/list_workspaces.py
new file mode 100644
index 00000000..170d330c
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/list_workspaces.py
@@ -0,0 +1,53 @@
+"""
+List workspaces (system-level; requires admin).
+"""
+
+import argparse
+
+import tabulate
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def do_list_workspaces(args):
+    """Fetch the workspace list from IAM and print it as a table.
+
+    Prints ``No workspaces.`` when the response has no ``workspaces``.
+    """
+    request = {"operation": "list-workspaces"}
+    response = call_iam(args.api_url, args.token, request)
+
+    workspaces = response.get("workspaces", [])
+    if not workspaces:
+        print("No workspaces.")
+        return
+
+    table = []
+    for ws in workspaces:
+        enabled = "yes" if ws.get("enabled") else "no"
+        table.append([
+            ws.get("id", ""),
+            ws.get("name", ""),
+            enabled,
+            ws.get("created", ""),
+        ])
+
+    rendered = tabulate.tabulate(
+        table,
+        headers=["id", "name", "enabled", "created"],
+        tablefmt="pretty",
+        stralign="left",
+    )
+    print(rendered)
+
+
+def main():
+    """Declare the tg-list-workspaces options and hand off to run_main."""
+    cli = argparse.ArgumentParser(
+        prog="tg-list-workspaces",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    run_main(do_list_workspaces, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/login.py b/trustgraph-cli/trustgraph/cli/login.py
new file mode 100644
index 00000000..0e87c3b0
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/login.py
@@ -0,0 +1,62 @@
+"""
+Log in with username / password. Prints the resulting JWT to
+stdout so it can be captured for subsequent CLI use.
+"""
+
+import argparse
+import getpass
+import sys
+
+from ._iam import DEFAULT_URL, call_auth, run_main
+
+
+def do_login(args):
+    """Log in with username/password and print the JWT to stdout.
+
+    The password comes from ``args.password`` or, when that is empty,
+    from a ``getpass`` prompt. ``workspace`` is only sent when set.
+    The expiry, if the response includes one, goes to stderr so stdout
+    carries only the token.
+
+    Raises:
+        RuntimeError: if the login response contains no JWT.
+    """
+    password = args.password
+    if not password:
+        password = getpass.getpass(f"Password for {args.username}: ")
+
+    body = {
+        "username": args.username,
+        "password": password,
+    }
+    if args.workspace:
+        body["workspace"] = args.workspace
+
+    # No token is passed: this is the call that obtains one.
+    resp = call_auth(args.api_url, "/api/v1/auth/login", None, body)
+
+    jwt = resp.get("jwt", "")
+    if not jwt:
+        # stdout is meant to be captured as the token; an empty line
+        # there would be indistinguishable from a successful login.
+        raise RuntimeError("Login returned no JWT — unexpected")
+
+    expires = resp.get("jwt_expires", "")
+    if expires:
+        print(f"JWT expires: {expires}", file=sys.stderr)
+    # Machine-readable on stdout.
+    print(jwt)
+
+
+def main():
+    """Declare the tg-login options and hand off to run_main."""
+    workspace_help = (
+        "Optional workspace to log in against. Defaults to "
+        "the user's assigned workspace."
+    )
+
+    cli = argparse.ArgumentParser(
+        prog="tg-login",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument("--username", required=True, help="Username")
+    cli.add_argument(
+        "--password",
+        default=None,
+        help="Password (prompted if omitted)",
+    )
+    cli.add_argument("-w", "--workspace", default=None, help=workspace_help)
+    run_main(do_login, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/reset_password.py b/trustgraph-cli/trustgraph/cli/reset_password.py
new file mode 100644
index 00000000..600f00e1
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/reset_password.py
@@ -0,0 +1,54 @@
+"""
+Admin: reset another user's password. Prints a one-time temporary
+password to stdout. The user is forced to change it on next login.
+"""
+
+import argparse
+import sys
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def do_reset_password(args):
+    """Reset a user's password via IAM and print the temporary password.
+
+    The label goes to stderr and the temporary password alone to
+    stdout. ``workspace`` is only sent when set.
+
+    Raises:
+        RuntimeError: if the response has no ``temporary_password``.
+    """
+    request = {"operation": "reset-password", "user_id": args.user_id}
+    if args.workspace:
+        request["workspace"] = args.workspace
+
+    response = call_iam(args.api_url, args.token, request)
+    temporary = response.get("temporary_password", "")
+
+    if temporary:
+        print(
+            "Temporary password (shown once, capture now):",
+            file=sys.stderr,
+        )
+        print(temporary)
+        return
+
+    raise RuntimeError(
+        "IAM returned no temporary password — unexpected"
+    )
+
+
+def main():
+    """Declare the tg-reset-password options and hand off to run_main."""
+    workspace_help = (
+        "Target workspace (admin only; defaults to caller's "
+        "assigned workspace)"
+    )
+
+    cli = argparse.ArgumentParser(
+        prog="tg-reset-password",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    cli.add_argument("--user-id", required=True, help="Target user id")
+    cli.add_argument("-w", "--workspace", default=None, help=workspace_help)
+    run_main(do_reset_password, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/revoke_api_key.py b/trustgraph-cli/trustgraph/cli/revoke_api_key.py
new file mode 100644
index 00000000..3976b56f
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/revoke_api_key.py
@@ -0,0 +1,44 @@
+"""
+Revoke an API key by id.
+"""
+
+import argparse
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def do_revoke_api_key(args):
+    """Send a ``revoke-api-key`` IAM request for ``args.key_id``.
+
+    The workspace is only included when ``args.workspace`` is set.
+    Prints a confirmation after ``call_iam`` returns.
+    """
+    request = {"operation": "revoke-api-key", "key_id": args.key_id}
+    if args.workspace:
+        request.update(workspace=args.workspace)
+    call_iam(args.api_url, args.token, request)
+    print(f"Revoked key {args.key_id}")
+
+
+def main():
+    """Declare the tg-revoke-api-key options and hand off to run_main."""
+    workspace_help = (
+        "Target workspace (admin only; defaults to caller's "
+        "assigned workspace)"
+    )
+
+    cli = argparse.ArgumentParser(
+        prog="tg-revoke-api-key",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    cli.add_argument("--key-id", required=True, help="Key id to revoke")
+    cli.add_argument("-w", "--workspace", default=None, help=workspace_help)
+    run_main(do_revoke_api_key, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/show_flow_state.py b/trustgraph-cli/trustgraph/cli/show_flow_state.py
index 8fec04ec..3a733270 100644
--- a/trustgraph-cli/trustgraph/cli/show_flow_state.py
+++ b/trustgraph-cli/trustgraph/cli/show_flow_state.py
@@ -44,16 +44,18 @@ def show_processors(metrics_url, flow_label):
obj = resp.json()
- tbl = [
- [
- m["metric"]["job"],
- "\U0001f49a" if int(m["value"][1]) > 0 else "\U0000274c"
- ]
- for m in obj["data"]["result"]
- ]
+ # consumer_state is one sample per consumer (queue); a processor
+ # with N subscriptions shows up N times. Aggregate to one row per
+ # processor: green only if every consumer is running.
+ by_proc = {}
+ for m in obj["data"]["result"]:
+ name = m["metric"].get("processor", m["metric"]["job"])
+ running = int(m["value"][1]) > 0
+ by_proc[name] = by_proc.get(name, True) and running
- for row in tbl:
- print(f"- {row[0]:30} {row[1]}")
+ for name in sorted(by_proc):
+ icon = "\U0001f49a" if by_proc[name] else "\U0000274c"
+ print(f"- {name:30} {icon}")
def main():
diff --git a/trustgraph-cli/trustgraph/cli/show_processor_state.py b/trustgraph-cli/trustgraph/cli/show_processor_state.py
index b4ae4a16..9de05bc6 100644
--- a/trustgraph-cli/trustgraph/cli/show_processor_state.py
+++ b/trustgraph-cli/trustgraph/cli/show_processor_state.py
@@ -17,7 +17,7 @@ def dump_status(url):
tbl = [
[
- m["metric"]["job"],
+ m["metric"].get("processor", m["metric"]["job"]),
"\U0001f49a"
]
for m in obj["data"]["result"]
diff --git a/trustgraph-cli/trustgraph/cli/update_user.py b/trustgraph-cli/trustgraph/cli/update_user.py
new file mode 100644
index 00000000..5c1dc4d7
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/update_user.py
@@ -0,0 +1,125 @@
+"""
+Update a user's profile fields: name, email, roles, enabled flag,
+must-change-password flag.
+
+Username is immutable — create a new user and disable the old one
+to effect a username change. Password changes go through
+``tg-change-password`` (self-service) or ``tg-reset-password``
+(admin-driven).
+
+Only the fields you supply are changed; omitted fields are left
+untouched on the user record. An empty ``--roles`` is rejected by
+iam-svc (a user must have at least one role); to demote a user use
+``tg-disable-user``.
+"""
+
+import argparse
+import sys
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def _parse_bool(s):
+    """Convert a yes/no style string to a bool.
+
+    Matching ignores case and surrounding whitespace. ``None`` is
+    passed through unchanged.
+
+    Raises:
+        argparse.ArgumentTypeError: if the value is not a recognised
+            yes/no spelling; the message quotes the normalised value.
+    """
+    if s is None:
+        return None
+
+    truthy = ("yes", "y", "true", "t", "1")
+    falsy = ("no", "n", "false", "f", "0")
+
+    token = s.strip().lower()
+    if token not in truthy + falsy:
+        raise argparse.ArgumentTypeError(
+            f"expected yes/no, got {token!r}"
+        )
+    return token in truthy
+
+
+def do_update_user(args):
+    """Send an ``update-user`` IAM request and print the returned record.
+
+    Only options that were supplied (i.e. are not ``None``) are put in
+    the update. If none were supplied, a usage message is written to
+    stderr and the process exits with status 2 without contacting IAM.
+    """
+    supplied = (
+        ("name", args.name),
+        ("email", args.email),
+        ("roles", args.roles),
+        ("enabled", args.enabled),
+        ("must_change_password", args.must_change_password),
+    )
+    user = {
+        field: value for field, value in supplied if value is not None
+    }
+
+    if not user:
+        print(
+            "tg-update-user: nothing to change — supply at least "
+            "one of --name / --email / --roles / --enabled / "
+            "--must-change-password",
+            file=sys.stderr,
+        )
+        sys.exit(2)
+
+    req = {
+        "operation": "update-user",
+        "user_id": args.user_id,
+        "user": user,
+    }
+    if args.workspace:
+        req["workspace"] = args.workspace
+
+    rec = call_iam(args.api_url, args.token, req).get("user", {})
+    enabled = "yes" if rec.get("enabled") else "no"
+    must_change = "yes" if rec.get("must_change_password") else "no"
+
+    print(f"id : {rec.get('id', '')}")
+    print(f"username : {rec.get('username', '')}")
+    print(f"name : {rec.get('name', '')}")
+    print(f"email : {rec.get('email', '')}")
+    print(f"workspace : {rec.get('workspace', '')}")
+    print(f"roles : {', '.join(rec.get('roles', []))}")
+    print(f"enabled : {enabled}")
+    print(
+        f"must-change-pw: "
+        f"{must_change}"
+    )
+
+
+def main():
+    """Declare the tg-update-user options and hand off to run_main."""
+    workspace_help = (
+        "Optional workspace integrity check — when supplied, "
+        "iam-svc verifies the target user's home workspace "
+        "matches"
+    )
+
+    cli = argparse.ArgumentParser(
+        prog="tg-update-user",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    cli.add_argument("--user-id", required=True, help="Target user id")
+    cli.add_argument("--name", default=None, help="New display name")
+    cli.add_argument("--email", default=None, help="New email")
+    cli.add_argument(
+        "--roles",
+        nargs="+",
+        default=None,
+        help="Replacement role list (e.g. --roles reader writer)",
+    )
+    cli.add_argument(
+        "--enabled",
+        type=_parse_bool,
+        default=None,
+        help="Set enabled flag (yes/no)",
+    )
+    cli.add_argument(
+        "--must-change-password",
+        type=_parse_bool,
+        default=None,
+        help="Set must-change-password flag (yes/no)",
+    )
+    cli.add_argument("-w", "--workspace", default=None, help=workspace_help)
+    run_main(do_update_user, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-cli/trustgraph/cli/whoami.py b/trustgraph-cli/trustgraph/cli/whoami.py
new file mode 100644
index 00000000..1799685d
--- /dev/null
+++ b/trustgraph-cli/trustgraph/cli/whoami.py
@@ -0,0 +1,52 @@
+"""
+Show the authenticated caller's own user record.
+"""
+
+import argparse
+
+import tabulate
+
+from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main
+
+
+def do_whoami(args):
+    """Fetch the caller's own user record from IAM and print it.
+
+    The record is printed as a two-column field/value table. If the
+    response has no ``user`` entry, a placeholder line is printed
+    instead.
+    """
+    def yes_no(flag):
+        return "yes" if flag else "no"
+
+    response = call_iam(args.api_url, args.token, {"operation": "whoami"})
+    user = response.get("user")
+    if not user:
+        print("(no user record returned)")
+        return
+
+    table = [
+        ["id", user.get("id", "")],
+        ["username", user.get("username", "")],
+        ["name", user.get("name", "")],
+        ["email", user.get("email", "")],
+        ["workspace", user.get("workspace", "")],
+        ["roles", ", ".join(user.get("roles", []))],
+        ["enabled", yes_no(user.get("enabled"))],
+        ["must change password", yes_no(user.get("must_change_password"))],
+        ["created", user.get("created", "")],
+    ]
+    print(tabulate.tabulate(table, tablefmt="plain"))
+
+
+def main():
+    """Declare the tg-whoami options and hand off to run_main."""
+    cli = argparse.ArgumentParser(
+        prog="tg-whoami",
+        description=__doc__,
+    )
+    cli.add_argument(
+        "-u", "--api-url",
+        default=DEFAULT_URL,
+        help=f"API URL (default: {DEFAULT_URL})",
+    )
+    cli.add_argument(
+        "-t", "--token",
+        default=DEFAULT_TOKEN,
+        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
+    )
+    run_main(do_whoami, cli)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/trustgraph-flow/pyproject.toml b/trustgraph-flow/pyproject.toml
index 8ba85adf..d8c690b5 100644
--- a/trustgraph-flow/pyproject.toml
+++ b/trustgraph-flow/pyproject.toml
@@ -60,8 +60,10 @@ agent-orchestrator = "trustgraph.agent.orchestrator:run"
api-gateway = "trustgraph.gateway:run"
chunker-recursive = "trustgraph.chunking.recursive:run"
chunker-token = "trustgraph.chunking.token:run"
+bootstrap = "trustgraph.bootstrap.bootstrapper:run"
config-svc = "trustgraph.config.service:run"
flow-svc = "trustgraph.flow.service:run"
+iam-svc = "trustgraph.iam.service:run"
doc-embeddings-query-milvus = "trustgraph.query.doc_embeddings.milvus:run"
doc-embeddings-query-pinecone = "trustgraph.query.doc_embeddings.pinecone:run"
doc-embeddings-query-qdrant = "trustgraph.query.doc_embeddings.qdrant:run"
diff --git a/trustgraph-flow/trustgraph/bootstrap/__init__.py b/trustgraph-flow/trustgraph/bootstrap/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/trustgraph-flow/trustgraph/bootstrap/base.py b/trustgraph-flow/trustgraph/bootstrap/base.py
new file mode 100644
index 00000000..cb022a16
--- /dev/null
+++ b/trustgraph-flow/trustgraph/bootstrap/base.py
@@ -0,0 +1,68 @@
+"""
+Bootstrap framework: Initialiser base class and per-wake context.
+
+See docs/tech-specs/bootstrap.md for the full design.
+"""
+
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+
+@dataclass
+class InitContext:
+ """Shared per-wake context passed to each initialiser.
+
+ The bootstrapper constructs one of these on every wake cycle,
+ tears it down at cycle end, and passes it into each initialiser's
+ ``run()`` method. Fields are short-lived and safe to use during
+ a single cycle only.
+ """
+
+ logger: logging.Logger
+ config: Any # ConfigClient
+ flow: Any # RequestResponse client for flow-svc
+
+
+class Initialiser:
+ """Base class for bootstrap initialisers.
+
+ Subclasses implement :meth:`run`. The bootstrapper manages
+ completion state, flag comparison, retry and error handling —
+ subclasses describe only the work to perform.
+
+ Class attributes:
+
+ * ``wait_for_services`` (bool, default ``True``): when ``True`` the
+ initialiser only runs after the bootstrapper's service gate has
+ passed (config-svc and flow-svc reachable). Set ``False`` for
+ initialisers that bring up infrastructure the gate itself
+ depends on — principally Pulsar topology, without which
+ config-svc cannot come online.
+ """
+
+ wait_for_services: bool = True
+
+ def __init__(self, **params):
+ # Subclasses should consume their own params via keyword
+ # arguments in their own __init__ signatures. This catch-all
+ # is here so any kwargs that filter through unnoticed don't
+ # raise TypeError on construction.
+ pass
+
+ async def run(self, ctx, old_flag, new_flag):
+ """Perform initialisation work.
+
+ :param ctx: :class:`InitContext` with logger, config client,
+ flow-svc client.
+ :param old_flag: Previously-stored flag string, or ``None`` if
+ this initialiser has never successfully completed in this
+ deployment.
+ :param new_flag: Currently-configured flag. A string chosen
+ by the operator; typically something like ``"v1"``.
+
+ :raises: Any exception on failure. The bootstrapper catches,
+ logs, and re-runs on the next cycle; completion state is
+ only written on clean return.
+ """
+ raise NotImplementedError
diff --git a/trustgraph-flow/trustgraph/bootstrap/bootstrapper/__init__.py b/trustgraph-flow/trustgraph/bootstrap/bootstrapper/__init__.py
new file mode 100644
index 00000000..98f4d9da
--- /dev/null
+++ b/trustgraph-flow/trustgraph/bootstrap/bootstrapper/__init__.py
@@ -0,0 +1 @@
+from . service import *
diff --git a/trustgraph-flow/trustgraph/bootstrap/bootstrapper/__main__.py b/trustgraph-flow/trustgraph/bootstrap/bootstrapper/__main__.py
new file mode 100644
index 00000000..da5a9021
--- /dev/null
+++ b/trustgraph-flow/trustgraph/bootstrap/bootstrapper/__main__.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+
+from . service import run
+
+if __name__ == '__main__':
+ run()
diff --git a/trustgraph-flow/trustgraph/bootstrap/bootstrapper/service.py b/trustgraph-flow/trustgraph/bootstrap/bootstrapper/service.py
new file mode 100644
index 00000000..eb6238d3
--- /dev/null
+++ b/trustgraph-flow/trustgraph/bootstrap/bootstrapper/service.py
@@ -0,0 +1,414 @@
+"""
+Bootstrapper processor.
+
+Runs a pluggable list of initialisers in a reconciliation loop.
+Each initialiser's completion state is recorded in the reserved
+``__system__`` workspace under the ``init-state`` config type.
+
+See docs/tech-specs/bootstrap.md for the full design.
+"""
+
+import asyncio
+import importlib
+import json
+import logging
+import uuid
+from argparse import ArgumentParser
+from dataclasses import dataclass
+
+from trustgraph.base import AsyncProcessor
+from trustgraph.base import ProducerMetrics, SubscriberMetrics
+from trustgraph.base.config_client import ConfigClient
+from trustgraph.base.request_response_spec import RequestResponse
+from trustgraph.schema import (
+ ConfigRequest, ConfigResponse,
+ config_request_queue, config_response_queue,
+)
+from trustgraph.schema import (
+ FlowRequest, FlowResponse,
+ flow_request_queue, flow_response_queue,
+)
+
+from .. base import Initialiser, InitContext
+
+logger = logging.getLogger(__name__)
+
+default_ident = "bootstrap"
+
+# Reserved workspace + config type under which completion state is
+# stored. Reserved (`_`-prefix) workspaces are excluded from the
+# config push broadcast — live processors never see these keys.
+SYSTEM_WORKSPACE = "__system__"
+INIT_STATE_TYPE = "init-state"
+
+# Cadence tiers.
+GATE_BACKOFF = 5 # Services not responding; retry soon.
+INIT_RETRY = 15 # Gate passed but something ran/failed;
+ # converge quickly.
+STEADY_INTERVAL = 300 # Everything at target flag; idle cheaply.
+
+
+@dataclass
+class InitialiserSpec:
+ """One entry in the bootstrapper's configured list of initialisers."""
+ name: str
+ flag: str
+ instance: Initialiser
+
+
+def _resolve_class(dotted):
+    """Return the object named by a ``package.module.Name`` path."""
+    # Everything before the last dot is the module to import.
+    mod_name, _dot, attr = dotted.rpartition(".")
+    if mod_name == "":
+        raise ValueError(
+            f"Initialiser class must be a dotted path, got {dotted!r}"
+        )
+    return getattr(importlib.import_module(mod_name), attr)
+
+
+def _load_initialisers_file(path):
+    """Load the initialisers spec list from a YAML or JSON file.
+
+    ``.yaml``/``.yml`` paths are parsed as YAML, anything else as
+    JSON.  Raises RuntimeError unless the document is a mapping whose
+    'initialisers' value is a list.  File shape (YAML):
+
+        initialisers:
+          - class: trustgraph.bootstrap.initialisers.PulsarTopology
+            name: pulsar-topology
+            flag: v1
+            params:
+              admin_url: http://pulsar:8080
+              tenant: tg
+          - ...
+    """
+    with open(path, encoding="utf-8") as f:
+        content = f.read()
+    if path.endswith((".yaml", ".yml")):
+        import yaml
+        doc = yaml.safe_load(content)
+    else:
+        doc = json.loads(content)
+    inits = doc.get("initialisers") if isinstance(doc, dict) else None
+    if not isinstance(inits, list):
+        raise RuntimeError(
+            f"{path}: expected a mapping with an 'initialisers' list")
+    return inits
+
+
+class Processor(AsyncProcessor):
+
+ def __init__(self, **params):
+     """Build the initialiser specs from 'initialisers' or a file."""
+     super().__init__(**params)
+
+     # Source the initialisers list either from a direct parameter
+     # (processor-group embedding) or from a file (CLI launch).
+     inits = params.get("initialisers")
+     if inits is None:
+         inits_file = params.get("initialisers_file")
+         if inits_file is None:
+             raise RuntimeError(
+                 "Bootstrapper requires either the 'initialisers' "
+                 "parameter or --initialisers-file"
+             )
+         inits = _load_initialisers_file(inits_file)
+
+     self.specs = []
+     names = set()
+
+     for entry in inits:
+         if not isinstance(entry, dict):
+             raise RuntimeError(
+                 f"Initialiser entry must be a mapping, got: {entry!r}"
+             )
+         for required in ("class", "name", "flag"):
+             if required not in entry:
+                 raise RuntimeError(
+                     f"Initialiser entry missing required field "
+                     f"{required!r}: {entry!r}"
+                 )
+
+         name = entry["name"]
+         if name in names:
+             raise RuntimeError(f"Duplicate initialiser name {name!r}")
+         names.add(name)
+
+         cls = _resolve_class(entry["class"])
+         try:
+             # A bare `params:` key parses as None; treat it as empty.
+             instance = cls(**(entry.get("params") or {}))
+         except Exception as e:
+             raise RuntimeError(
+                 f"Failed to instantiate initialiser "
+                 f"{entry['class']!r} as {name!r}: "
+                 f"{type(e).__name__}: {e}"
+             ) from e
+
+         self.specs.append(InitialiserSpec(
+             name=name,
+             flag=entry["flag"],
+             instance=instance,
+         ))
+
+     logger.info(
+         f"Bootstrapper: loaded {len(self.specs)} initialisers"
+     )
+
+ # ------------------------------------------------------------------
+ # Client construction (short-lived per wake cycle).
+ # ------------------------------------------------------------------
+
+ def _make_config_client(self):
+ rr_id = str(uuid.uuid4())
+ return ConfigClient(
+ backend=self.pubsub_backend,
+ subscription=f"{self.id}--config--{rr_id}",
+ consumer_name=self.id,
+ request_topic=config_request_queue,
+ request_schema=ConfigRequest,
+ request_metrics=ProducerMetrics(
+ processor=self.id, flow=None, name="config-request",
+ ),
+ response_topic=config_response_queue,
+ response_schema=ConfigResponse,
+ response_metrics=SubscriberMetrics(
+ processor=self.id, flow=None, name="config-response",
+ ),
+ )
+
+ def _make_flow_client(self):
+ rr_id = str(uuid.uuid4())
+ return RequestResponse(
+ backend=self.pubsub_backend,
+ subscription=f"{self.id}--flow--{rr_id}",
+ consumer_name=self.id,
+ request_topic=flow_request_queue,
+ request_schema=FlowRequest,
+ request_metrics=ProducerMetrics(
+ processor=self.id, flow=None, name="flow-request",
+ ),
+ response_topic=flow_response_queue,
+ response_schema=FlowResponse,
+ response_metrics=SubscriberMetrics(
+ processor=self.id, flow=None, name="flow-response",
+ ),
+ )
+
+ async def _open_clients(self):
+ config = self._make_config_client()
+ flow = self._make_flow_client()
+ await config.start()
+ try:
+ await flow.start()
+ except Exception:
+ await self._safe_stop(config)
+ raise
+ return config, flow
+
+ async def _safe_stop(self, client):
+     try:
+         await client.stop()
+     except Exception as e:  # best-effort teardown; never propagate
+         logger.debug(f"Client stop failed (ignored): {e!r}")
+
+ # ------------------------------------------------------------------
+ # Service gate.
+ # ------------------------------------------------------------------
+
+ async def _gate_ready(self, config, flow):
+ try:
+ await config.keys(SYSTEM_WORKSPACE, INIT_STATE_TYPE)
+ except Exception as e:
+ logger.info(
+ f"Gate: config-svc not ready ({type(e).__name__}: {e})"
+ )
+ return False
+
+ try:
+ resp = await flow.request(
+ FlowRequest(
+ operation="list-blueprints",
+ workspace=SYSTEM_WORKSPACE,
+ ),
+ timeout=5,
+ )
+ if resp.error:
+ logger.info(
+ f"Gate: flow-svc error: "
+ f"{resp.error.type}: {resp.error.message}"
+ )
+ return False
+ except Exception as e:
+ logger.info(
+ f"Gate: flow-svc not ready ({type(e).__name__}: {e})"
+ )
+ return False
+
+ return True
+
+ # ------------------------------------------------------------------
+ # Completion state.
+ # ------------------------------------------------------------------
+
+ async def _stored_flag(self, config, name):
+ raw = await config.get(SYSTEM_WORKSPACE, INIT_STATE_TYPE, name)
+ if raw is None:
+ return None
+ try:
+ return json.loads(raw)
+ except Exception:
+ return raw
+
+ async def _store_flag(self, config, name, flag):
+ await config.put(
+ SYSTEM_WORKSPACE, INIT_STATE_TYPE, name,
+ json.dumps(flag),
+ )
+
+ # ------------------------------------------------------------------
+ # Per-spec execution.
+ # ------------------------------------------------------------------
+
+ async def _run_spec(self, spec, config, flow):
+ """Run a single initialiser spec.
+
+ Returns one of:
+ - ``"skip"``: stored flag already matches target, nothing to do.
+ - ``"ran"``: initialiser ran and completion state was updated.
+ - ``"failed"``: initialiser raised.
+ - ``"failed-state-write"``: initialiser succeeded but we could
+ not persist the new flag (transient — will re-run next cycle).
+ """
+
+ try:
+ old_flag = await self._stored_flag(config, spec.name)
+ except Exception as e:
+ logger.warning(
+ f"{spec.name}: could not read stored flag "
+ f"({type(e).__name__}: {e})"
+ )
+ return "failed"
+
+ if old_flag == spec.flag:
+ return "skip"
+
+ child_logger = logger.getChild(spec.name)
+ child_ctx = InitContext(
+ logger=child_logger,
+ config=config,
+ flow=flow,
+ )
+
+ child_logger.info(
+ f"Running (old_flag={old_flag!r} -> new_flag={spec.flag!r})"
+ )
+
+ try:
+ await spec.instance.run(child_ctx, old_flag, spec.flag)
+ except Exception as e:
+ child_logger.error(
+ f"Failed: {type(e).__name__}: {e}", exc_info=True,
+ )
+ return "failed"
+
+ try:
+ await self._store_flag(config, spec.name, spec.flag)
+ except Exception as e:
+ child_logger.warning(
+ f"Completed but could not persist state flag "
+ f"({type(e).__name__}: {e}); will re-run next cycle"
+ )
+ return "failed-state-write"
+
+ child_logger.info(f"Completed (flag={spec.flag!r})")
+ return "ran"
+
+ # ------------------------------------------------------------------
+ # Main loop.
+ # ------------------------------------------------------------------
+
+ async def run(self):
+
+ logger.info(
+ f"Bootstrapper starting with {len(self.specs)} initialisers"
+ )
+
+ while self.running:
+
+ sleep_for = STEADY_INTERVAL
+
+ try:
+ config, flow = await self._open_clients()
+ except Exception as e:
+ logger.info(
+ f"Failed to open clients "
+ f"({type(e).__name__}: {e}); retry in {GATE_BACKOFF}s"
+ )
+ await asyncio.sleep(GATE_BACKOFF)
+ continue
+
+ try:
+ # Phase 1: pre-service initialisers run unconditionally.
+ pre_specs = [
+ s for s in self.specs
+ if not s.instance.wait_for_services
+ ]
+ pre_results = {}
+ for spec in pre_specs:
+ pre_results[spec.name] = await self._run_spec(
+ spec, config, flow,
+ )
+
+ # Phase 2: gate.
+ gate_ok = await self._gate_ready(config, flow)
+
+ # Phase 3: post-service initialisers, if gate passed.
+ post_results = {}
+ if gate_ok:
+ post_specs = [
+ s for s in self.specs
+ if s.instance.wait_for_services
+ ]
+ for spec in post_specs:
+ post_results[spec.name] = await self._run_spec(
+ spec, config, flow,
+ )
+
+ # Cadence selection.
+ if not gate_ok:
+ sleep_for = GATE_BACKOFF
+ else:
+ all_results = {**pre_results, **post_results}
+ if any(r != "skip" for r in all_results.values()):
+ sleep_for = INIT_RETRY
+ else:
+ sleep_for = STEADY_INTERVAL
+
+ finally:
+ await self._safe_stop(config)
+ await self._safe_stop(flow)
+
+ await asyncio.sleep(sleep_for)
+
+ # ------------------------------------------------------------------
+ # CLI arg plumbing.
+ # ------------------------------------------------------------------
+
+ @staticmethod
+ def add_args(parser: ArgumentParser) -> None:
+
+ AsyncProcessor.add_args(parser)
+
+ parser.add_argument(
+ '-c', '--initialisers-file',
+ help='Path to YAML or JSON file describing the '
+ 'initialisers to run. Ignored when the '
+ "'initialisers' parameter is provided directly "
+ '(e.g. when running inside a processor group).',
+ )
+
+
+def run():
+ Processor.launch(default_ident, __doc__)
diff --git a/trustgraph-flow/trustgraph/bootstrap/initialisers/__init__.py b/trustgraph-flow/trustgraph/bootstrap/initialisers/__init__.py
new file mode 100644
index 00000000..6171eb02
--- /dev/null
+++ b/trustgraph-flow/trustgraph/bootstrap/initialisers/__init__.py
@@ -0,0 +1,20 @@
+"""
+Core bootstrap initialisers.
+
+These cover the base TrustGraph deployment case. Enterprise or
+third-party initialisers live in their own packages and are
+referenced in the bootstrapper's config by fully-qualified dotted
+path.
+"""
+
+from . pulsar_topology import PulsarTopology
+from . template_seed import TemplateSeed
+from . workspace_init import WorkspaceInit
+from . default_flow_start import DefaultFlowStart
+
+__all__ = [
+ "PulsarTopology",
+ "TemplateSeed",
+ "WorkspaceInit",
+ "DefaultFlowStart",
+]
diff --git a/trustgraph-flow/trustgraph/bootstrap/initialisers/default_flow_start.py b/trustgraph-flow/trustgraph/bootstrap/initialisers/default_flow_start.py
new file mode 100644
index 00000000..7e7f96bd
--- /dev/null
+++ b/trustgraph-flow/trustgraph/bootstrap/initialisers/default_flow_start.py
@@ -0,0 +1,101 @@
+"""
+DefaultFlowStart initialiser — starts a named flow in a workspace
+using a specified blueprint.
+
+Separated from WorkspaceInit so deployments that want a workspace
+without an auto-started flow can simply omit this initialiser.
+
+Parameters
+----------
+workspace : str (default "default")
+ Workspace in which to start the flow.
+flow_id : str (default "default")
+ Identifier for the started flow.
+blueprint : str (required)
+ Blueprint name (must already exist in the workspace's config,
+ typically via TemplateSeed -> WorkspaceInit).
+description : str (default "Default")
+ Human-readable description passed to flow-svc.
+parameters : dict (optional)
+ Optional parameter overrides passed to start-flow.
+"""
+
+from trustgraph.schema import FlowRequest
+
+from .. base import Initialiser
+
+
+class DefaultFlowStart(Initialiser):
+
+ def __init__(
+ self,
+ workspace="default",
+ flow_id="default",
+ blueprint=None,
+ description="Default",
+ parameters=None,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ if not blueprint:
+ raise ValueError(
+ "DefaultFlowStart requires 'blueprint'"
+ )
+ self.workspace = workspace
+ self.flow_id = flow_id
+ self.blueprint = blueprint
+ self.description = description
+ self.parameters = dict(parameters) if parameters else {}
+
+ async def run(self, ctx, old_flag, new_flag):
+
+ # Check whether the flow already exists. Belt-and-braces
+ # beyond the flag gate: if an operator stops and restarts the
+ # bootstrapper after the flow is already running, we don't
+ # want to blindly try to start it again.
+ list_resp = await ctx.flow.request(
+ FlowRequest(
+ operation="list-flows",
+ workspace=self.workspace,
+ ),
+ timeout=10,
+ )
+ if list_resp.error:
+ raise RuntimeError(
+ f"list-flows failed: "
+ f"{list_resp.error.type}: {list_resp.error.message}"
+ )
+
+ if self.flow_id in (list_resp.flow_ids or []):
+ ctx.logger.info(
+ f"Flow {self.flow_id!r} already running in workspace "
+ f"{self.workspace!r}; nothing to do"
+ )
+ return
+
+ ctx.logger.info(
+ f"Starting flow {self.flow_id!r} "
+ f"(blueprint={self.blueprint!r}) "
+ f"in workspace {self.workspace!r}"
+ )
+
+ resp = await ctx.flow.request(
+ FlowRequest(
+ operation="start-flow",
+ workspace=self.workspace,
+ flow_id=self.flow_id,
+ blueprint_name=self.blueprint,
+ description=self.description,
+ parameters=self.parameters,
+ ),
+ timeout=30,
+ )
+ if resp.error:
+ raise RuntimeError(
+ f"start-flow failed: "
+ f"{resp.error.type}: {resp.error.message}"
+ )
+
+ ctx.logger.info(
+ f"Flow {self.flow_id!r} started"
+ )
diff --git a/trustgraph-flow/trustgraph/bootstrap/initialisers/pulsar_topology.py b/trustgraph-flow/trustgraph/bootstrap/initialisers/pulsar_topology.py
new file mode 100644
index 00000000..843fe056
--- /dev/null
+++ b/trustgraph-flow/trustgraph/bootstrap/initialisers/pulsar_topology.py
@@ -0,0 +1,131 @@
+"""
+PulsarTopology initialiser — creates Pulsar tenant and namespaces
+with their retention policies.
+
+Runs pre-gate (``wait_for_services = False``) because config-svc and
+flow-svc can't connect to Pulsar until these namespaces exist.
+Admin-API calls are idempotent so re-runs on flag change are safe.
+"""
+
+import asyncio
+import requests
+
+from .. base import Initialiser
+
+# Namespace configs. flow/request take broker defaults. response
+# and notify get aggressive retention — those classes carry short-lived
+# request/response and notification traffic only.
+NAMESPACE_CONFIG = {
+ "flow": {},
+ "request": {},
+ "response": {
+ "retention_policies": {
+ "retentionSizeInMB": -1,
+ "retentionTimeInMinutes": 3,
+ "subscriptionExpirationTimeMinutes": 30,
+ },
+ },
+ "notify": {
+ "retention_policies": {
+ "retentionSizeInMB": -1,
+ "retentionTimeInMinutes": 3,
+ "subscriptionExpirationTimeMinutes": 5,
+ },
+ },
+}
+
+REQUEST_TIMEOUT = 10
+
+
+class PulsarTopology(Initialiser):
+
+ wait_for_services = False
+
+ def __init__(
+ self,
+ admin_url="http://pulsar:8080",
+ tenant="tg",
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.admin_url = admin_url.rstrip("/")
+ self.tenant = tenant
+
+ async def run(self, ctx, old_flag, new_flag):
+     """Reconcile tenant and namespaces; the flags are not consulted."""
+     # requests is blocking, so run it on the loop's default executor.
+     loop = asyncio.get_running_loop()
+     await loop.run_in_executor(None, self._reconcile_sync, ctx.logger)
+
+ # ------------------------------------------------------------------
+ # Sync admin-API calls.
+ # ------------------------------------------------------------------
+
+ def _get_clusters(self):
+ resp = requests.get(
+ f"{self.admin_url}/admin/v2/clusters",
+ timeout=REQUEST_TIMEOUT,
+ )
+ resp.raise_for_status()
+ return resp.json()
+
+ def _tenant_exists(self):
+ resp = requests.get(
+ f"{self.admin_url}/admin/v2/tenants/{self.tenant}",
+ timeout=REQUEST_TIMEOUT,
+ )
+ return resp.status_code == 200
+
+ def _create_tenant(self, clusters):
+     """PUT the tenant, allowing it on the given clusters."""
+     resp = requests.put(
+         f"{self.admin_url}/admin/v2/tenants/{self.tenant}",
+         json={"adminRoles": [], "allowedClusters": clusters},
+         timeout=REQUEST_TIMEOUT)
+     # 409 = already exists (e.g. created concurrently): not a failure.
+     if resp.status_code not in (204, 409):
+         raise RuntimeError(
+             f"Tenant {self.tenant!r} create failed: "
+             f"{resp.status_code} {resp.text}")
+
+ def _namespace_exists(self, namespace):
+ resp = requests.get(
+ f"{self.admin_url}/admin/v2/namespaces/"
+ f"{self.tenant}/{namespace}",
+ timeout=REQUEST_TIMEOUT,
+ )
+ return resp.status_code == 200
+
+ def _create_namespace(self, namespace, config):
+     """PUT the namespace, sending ``config`` as the JSON body."""
+     resp = requests.put(
+         f"{self.admin_url}/admin/v2/namespaces/"
+         f"{self.tenant}/{namespace}",
+         json=config,
+         timeout=REQUEST_TIMEOUT)
+     # 409 = already exists (e.g. created concurrently): not a failure.
+     if resp.status_code not in (204, 409):
+         raise RuntimeError(
+             f"Namespace {self.tenant}/{namespace} create failed: "
+             f"{resp.status_code} {resp.text}")
+
+ def _reconcile_sync(self, logger):
+ if not self._tenant_exists():
+ clusters = self._get_clusters()
+ logger.info(
+ f"Creating tenant {self.tenant!r} with clusters {clusters}"
+ )
+ self._create_tenant(clusters)
+ else:
+ logger.debug(f"Tenant {self.tenant!r} already exists")
+
+ for namespace, config in NAMESPACE_CONFIG.items():
+ if self._namespace_exists(namespace):
+ logger.debug(
+ f"Namespace {self.tenant}/{namespace} already exists"
+ )
+ continue
+ logger.info(
+ f"Creating namespace {self.tenant}/{namespace}"
+ )
+ self._create_namespace(namespace, config)
diff --git a/trustgraph-flow/trustgraph/bootstrap/initialisers/template_seed.py b/trustgraph-flow/trustgraph/bootstrap/initialisers/template_seed.py
new file mode 100644
index 00000000..5f1e4c19
--- /dev/null
+++ b/trustgraph-flow/trustgraph/bootstrap/initialisers/template_seed.py
@@ -0,0 +1,93 @@
+"""
+TemplateSeed initialiser — populates the reserved ``__template__``
+workspace from an external JSON seed file.
+
+Seed file shape:
+
+.. code-block:: json
+
+ {
+ "flow-blueprint": {
+ "ontology": { ... },
+ "agent": { ... }
+ },
+ "prompt": {
+ ...
+ },
+ ...
+ }
+
+Top-level keys are config types; nested keys are config entries.
+Values are arbitrary JSON (they'll be ``json.dumps()``'d on write).
+
+Parameters
+----------
+config_file : str
+ Path to the seed file on disk.
+overwrite : bool (default False)
+ On re-run (flag change), if True overwrite all keys; if False
+ upsert-missing-only (preserves any operator customisation of
+ the template).
+"""
+
+import json
+
+from .. base import Initialiser
+
+TEMPLATE_WORKSPACE = "__template__"
+
+
+class TemplateSeed(Initialiser):
+
+ def __init__(self, config_file, overwrite=False, **kwargs):
+ super().__init__(**kwargs)
+ if not config_file:
+ raise ValueError("TemplateSeed requires 'config_file'")
+ self.config_file = config_file
+ self.overwrite = overwrite
+
+ async def run(self, ctx, old_flag, new_flag):
+     """Write the seed file's entries into the template workspace.
+
+     ``old_flag`` of ``None`` means first run: every entry is written.
+     On a re-run, ``overwrite`` selects write-all vs. missing-only.
+     """
+     # JSON is UTF-8 by spec; don't rely on the locale's default.
+     with open(self.config_file, encoding="utf-8") as f:
+         seed = json.load(f)
+
+     if old_flag is None or self.overwrite:
+         await self._write_all(ctx, seed)
+     else:
+         # Re-run after flag change, preserving existing keys.
+         await self._upsert_missing(ctx, seed)
+
+ async def _write_all(self, ctx, seed):
+ values = []
+ for type_name, entries in seed.items():
+ for key, value in entries.items():
+ values.append((type_name, key, json.dumps(value)))
+ if values:
+ await ctx.config.put_many(TEMPLATE_WORKSPACE, values)
+ ctx.logger.info(
+ f"Template seeded with {len(values)} entries"
+ )
+
+ async def _upsert_missing(self, ctx, seed):
+ written = 0
+ for type_name, entries in seed.items():
+ existing = set(
+ await ctx.config.keys(TEMPLATE_WORKSPACE, type_name)
+ )
+ values = []
+ for key, value in entries.items():
+ if key not in existing:
+ values.append(
+ (type_name, key, json.dumps(value))
+ )
+ if values:
+ await ctx.config.put_many(TEMPLATE_WORKSPACE, values)
+ written += len(values)
+ ctx.logger.info(
+ f"Template upsert-missing: {written} new entries"
+ )
diff --git a/trustgraph-flow/trustgraph/bootstrap/initialisers/workspace_init.py b/trustgraph-flow/trustgraph/bootstrap/initialisers/workspace_init.py
new file mode 100644
index 00000000..10aefe9d
--- /dev/null
+++ b/trustgraph-flow/trustgraph/bootstrap/initialisers/workspace_init.py
@@ -0,0 +1,138 @@
+"""
+WorkspaceInit initialiser — creates a workspace and populates it from
+either the ``__template__`` workspace or a seed file on disk.
+
+Parameters
+----------
+workspace : str
+ Target workspace to create / populate.
+source : str
+ Either ``"template"`` (copy the full contents of the
+ ``__template__`` workspace) or ``"seed-file"`` (read from
+ ``seed_file``).
+seed_file : str (required when source=="seed-file")
+ Path to a JSON seed file with the same shape TemplateSeed consumes.
+overwrite : bool (default False)
+ On re-run (flag change), if True overwrite all keys; if False,
+ upsert-missing-only (preserves in-workspace customisations).
+
+Raises (in ``run``)
+-------------------
+When source is ``"template"``, raises ``RuntimeError`` if the
+``__template__`` workspace is empty — indicating that TemplateSeed
+hasn't run yet. The bootstrapper's retry loop will re-attempt on
+the next cycle once the prerequisite is satisfied.
+"""
+
+import json
+
+from .. base import Initialiser
+
+TEMPLATE_WORKSPACE = "__template__"
+
+
+class WorkspaceInit(Initialiser):
+
+ def __init__(
+ self,
+ workspace="default",
+ source="template",
+ seed_file=None,
+ overwrite=False,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ if source not in ("template", "seed-file"):
+ raise ValueError(
+ f"WorkspaceInit: source must be 'template' or "
+ f"'seed-file', got {source!r}"
+ )
+ if source == "seed-file" and not seed_file:
+ raise ValueError(
+ "WorkspaceInit: seed_file required when source='seed-file'"
+ )
+
+ self.workspace = workspace
+ self.source = source
+ self.seed_file = seed_file
+ self.overwrite = overwrite
+
+ async def run(self, ctx, old_flag, new_flag):
+ if self.source == "seed-file":
+ tree = self._load_seed_file()
+ else:
+ tree = await self._load_from_template(ctx)
+
+ if old_flag is None or self.overwrite:
+ await self._write_all(ctx, tree)
+ else:
+ await self._upsert_missing(ctx, tree)
+
+ def _load_seed_file(self):
+     with open(self.seed_file, encoding="utf-8") as f:  # JSON is UTF-8
+         return json.load(f)
+
+ async def _load_from_template(self, ctx):
+ """Build a seed tree from the entire ``__template__`` workspace.
+ Raises if the workspace is empty, so the bootstrapper knows
+ the prerequisite isn't met yet."""
+
+ raw_tree = await ctx.config.get_all(TEMPLATE_WORKSPACE)
+
+ tree = {}
+ total = 0
+ for type_name, entries in raw_tree.items():
+ parsed = {}
+ for key, raw in entries.items():
+ if raw is None:
+ continue
+ try:
+ parsed[key] = json.loads(raw)
+ except Exception:
+ parsed[key] = raw
+ total += 1
+ if parsed:
+ tree[type_name] = parsed
+
+ if total == 0:
+ raise RuntimeError(
+ "Template workspace is empty — has TemplateSeed run yet?"
+ )
+
+ ctx.logger.debug(
+ f"Loaded {total} template entries across {len(tree)} types"
+ )
+ return tree
+
+ async def _write_all(self, ctx, tree):
+ values = []
+ for type_name, entries in tree.items():
+ for key, value in entries.items():
+ values.append((type_name, key, json.dumps(value)))
+ if values:
+ await ctx.config.put_many(self.workspace, values)
+ ctx.logger.info(
+ f"Workspace {self.workspace!r} populated with "
+ f"{len(values)} entries"
+ )
+
+ async def _upsert_missing(self, ctx, tree):
+ written = 0
+ for type_name, entries in tree.items():
+ existing = set(
+ await ctx.config.keys(self.workspace, type_name)
+ )
+ values = []
+ for key, value in entries.items():
+ if key not in existing:
+ values.append(
+ (type_name, key, json.dumps(value))
+ )
+ if values:
+ await ctx.config.put_many(self.workspace, values)
+ written += len(values)
+ ctx.logger.info(
+ f"Workspace {self.workspace!r} upsert-missing: "
+ f"{written} new entries"
+ )
diff --git a/trustgraph-flow/trustgraph/chunking/recursive/chunker.py b/trustgraph-flow/trustgraph/chunking/recursive/chunker.py
index 098e6111..a0052c79 100755
--- a/trustgraph-flow/trustgraph/chunking/recursive/chunker.py
+++ b/trustgraph-flow/trustgraph/chunking/recursive/chunker.py
@@ -58,7 +58,7 @@ class Processor(ChunkingService):
2500, 4000, 6400, 10000, 16000]
)
- self.text_splitter = self.RecursiveCharacterTextSplitter(
+ self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
length_function=len,
@@ -111,7 +111,7 @@ class Processor(ChunkingService):
chunk_overlap = int(chunk_overlap)
# Create text splitter with effective parameters
- text_splitter = self.RecursiveCharacterTextSplitter(
+ text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
length_function=len,
diff --git a/trustgraph-flow/trustgraph/chunking/token/chunker.py b/trustgraph-flow/trustgraph/chunking/token/chunker.py
index 3bf907a4..c3935e4b 100755
--- a/trustgraph-flow/trustgraph/chunking/token/chunker.py
+++ b/trustgraph-flow/trustgraph/chunking/token/chunker.py
@@ -56,7 +56,7 @@ class Processor(ChunkingService):
2500, 4000, 6400, 10000, 16000]
)
- self.text_splitter = self.TokenTextSplitter(
+ self.text_splitter = TokenTextSplitter(
encoding_name="cl100k_base",
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
@@ -108,7 +108,7 @@ class Processor(ChunkingService):
chunk_overlap = int(chunk_overlap)
# Create text splitter with effective parameters
- text_splitter = self.TokenTextSplitter(
+ text_splitter = TokenTextSplitter(
encoding_name="cl100k_base",
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
diff --git a/trustgraph-flow/trustgraph/config/service/service.py b/trustgraph-flow/trustgraph/config/service/service.py
index 56a54ee0..058f4e4b 100644
--- a/trustgraph-flow/trustgraph/config/service/service.py
+++ b/trustgraph-flow/trustgraph/config/service/service.py
@@ -24,6 +24,21 @@ logger = logging.getLogger(__name__)
default_ident = "config-svc"
+
+def is_reserved_workspace(workspace):
+ """Reserved workspaces are storage-only.
+
+ Any workspace id beginning with ``_`` is reserved for internal use
+ (e.g. ``__template__`` holding factory-default seed config).
+ Reads and writes work normally so bootstrap and provisioning code
+ can use the standard config API, but **change notifications for
+ reserved workspaces are suppressed**. Services subscribed to the
+ config push therefore never see reserved-workspace events and
+ cannot accidentally act on template content as if it were live
+ state.
+ """
+ return workspace.startswith("_")
+
default_config_request_queue = config_request_queue
default_config_response_queue = config_response_queue
default_config_push_queue = config_push_queue
@@ -130,6 +145,21 @@ class Processor(AsyncProcessor):
async def push(self, changes=None):
+ # Suppress notifications from reserved workspaces (ids starting
+ # with "_", e.g. "__template__"). Stored config is preserved;
+ # only the broadcast is filtered. Keeps services oblivious to
+ # template / bootstrap state.
+ if changes:
+ filtered = {}
+ for type_name, workspaces in changes.items():
+ visible = [
+ w for w in workspaces
+ if not is_reserved_workspace(w)
+ ]
+ if visible:
+ filtered[type_name] = visible
+ changes = filtered
+
version = await self.config.get_version()
resp = ConfigPush(
diff --git a/trustgraph-flow/trustgraph/embeddings/ollama/processor.py b/trustgraph-flow/trustgraph/embeddings/ollama/processor.py
index c63db33c..5fa74054 100755
--- a/trustgraph-flow/trustgraph/embeddings/ollama/processor.py
+++ b/trustgraph-flow/trustgraph/embeddings/ollama/processor.py
@@ -5,7 +5,7 @@ Input is text, output is embeddings vector.
"""
from ... base import EmbeddingsService
-from ollama import Client
+from ollama import AsyncClient
import os
import logging
@@ -30,24 +30,24 @@ class Processor(EmbeddingsService):
}
)
- self.client = Client(host=ollama)
+ self.client = AsyncClient(host=ollama)
self.default_model = model
self._checked_models = set()
- def _ensure_model(self, model_name):
+ async def _ensure_model(self, model_name):
"""Check if model exists locally, pull it if not."""
if model_name in self._checked_models:
return
try:
- self.client.show(model_name)
+ await self.client.show(model_name)
self._checked_models.add(model_name)
except Exception as e:
status_code = getattr(e, 'status_code', None)
if status_code == 404 or "not found" in str(e).lower():
logger.info(f"Ollama model '{model_name}' not found locally. Pulling, this may take a while...")
try:
- self.client.pull(model_name)
+ await self.client.pull(model_name)
self._checked_models.add(model_name)
logger.info(f"Successfully pulled Ollama model '{model_name}'.")
except Exception as pull_e:
@@ -63,10 +63,10 @@ class Processor(EmbeddingsService):
use_model = model or self.default_model
# Ensure the model exists/is pulled
- self._ensure_model(use_model)
+ await self._ensure_model(use_model)
# Ollama handles batch input efficiently
- embeds = self.client.embed(
+ embeds = await self.client.embed(
model = use_model,
input = texts
)
diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
index ef9a7331..1d45d3f9 100644
--- a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
@@ -540,6 +540,32 @@ class Processor(FlowProcessor):
return True
return False
+ def _is_subclass_of(self, cls, target, ontology_subset, max_depth=100):
+ """Return True if cls is a subclass of target via subclass_of chain.
+
+ Defends against cycles in ontology data (LLM-generated ontologies may
+ emit A subclass_of B, B subclass_of A) with a visited set. A depth cap
+ acts as a second line of defense against unbounded chains.
+ """
+ if cls == target:
+ return True
+ visited = set()
+ curr = cls
+ depth = 0
+ while curr in ontology_subset.classes and depth < max_depth:
+ if curr in visited:
+ return False # cycle detected
+ visited.add(curr)
+ cls_def = ontology_subset.classes[curr]
+ parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None
+ if parent is None:
+ return False
+ if parent == target:
+ return True
+ curr = parent
+ depth += 1
+ return False
+
def is_valid_triple(self, subject: str, predicate: str, object_val: str,
ontology_subset: OntologySubset, entity_types: dict = None) -> bool:
"""Validate triple against ontology constraints."""
@@ -570,36 +596,20 @@ class Processor(FlowProcessor):
expected_domain = prop_def.get('domain')
if expected_domain and subject in entity_types:
actual_domain = entity_types[subject]
- if actual_domain != expected_domain:
- is_subclass = False
- curr_class = actual_domain
- while curr_class in ontology_subset.classes:
- cls_def = ontology_subset.classes[curr_class]
- parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None
- if parent == expected_domain:
- is_subclass = True
- break
- curr_class = parent
- if not is_subclass:
- return False
+ if actual_domain != expected_domain and not self._is_subclass_of(
+ actual_domain, expected_domain, ontology_subset
+ ):
+ return False
# Range validation
if is_obj_prop:
expected_range = prop_def.get('range')
if expected_range and object_val in entity_types:
actual_range = entity_types[object_val]
- if actual_range != expected_range:
- is_subclass = False
- curr_class = actual_range
- while curr_class in ontology_subset.classes:
- cls_def = ontology_subset.classes[curr_class]
- parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None
- if parent == expected_range:
- is_subclass = True
- break
- curr_class = parent
- if not is_subclass:
- return False
+ if actual_range != expected_range and not self._is_subclass_of(
+ actual_range, expected_range, ontology_subset
+ ):
+ return False
return True
@@ -988,4 +998,4 @@ class Processor(FlowProcessor):
def run():
"""Launch the OntoRAG extraction service."""
- Processor.launch(default_ident, __doc__)
\ No newline at end of file
+ Processor.launch(default_ident, __doc__)
diff --git a/trustgraph-flow/trustgraph/gateway/auth.py b/trustgraph-flow/trustgraph/gateway/auth.py
index a693ca32..6abcbe15 100644
--- a/trustgraph-flow/trustgraph/gateway/auth.py
+++ b/trustgraph-flow/trustgraph/gateway/auth.py
@@ -1,22 +1,371 @@
+"""
+IAM-backed authentication and authorisation for the API gateway.
-class Authenticator:
+The gateway delegates both authentication ("who is this caller?")
+and authorisation ("may they do this?") to the IAM regime via the
+contract specified in docs/tech-specs/iam-contract.md. No regime-
+specific policy (roles, scopes, claims) lives in the gateway.
- def __init__(self, token=None, allow_all=False):
+- Authentication: API keys are resolved by IAM; JWTs are validated
+ locally against the cached signing public key.
+- Authorisation: every per-request decision is asked of IAM via
+ ``authorise(identity, capability, resource, parameters)``, with
+ results cached for the TTL the regime returns.
+"""
- if not allow_all and token is None:
- raise RuntimeError("Need a token")
+import asyncio
+import base64
+import hashlib
+import json
+import logging
+import time
+import uuid
+from dataclasses import dataclass, field
- if not allow_all and token == "":
- raise RuntimeError("Need a token")
+from aiohttp import web
- self.token = token
- self.allow_all = allow_all
+from cryptography.hazmat.primitives import serialization
+from cryptography.hazmat.primitives.asymmetric import ed25519
- def permitted(self, token, roles):
+from ..base.iam_client import IamClient
+from ..base.metrics import ProducerMetrics, SubscriberMetrics
+from ..schema import (
+ IamRequest, IamResponse,
+ iam_request_queue, iam_response_queue,
+)
- if self.allow_all: return True
+logger = logging.getLogger("auth")
- if self.token != token: return False
+API_KEY_CACHE_TTL = 60 # seconds
- return True
+# Upper bound on cache TTL the gateway honours for an authorisation
+# decision, regardless of what the regime suggested. Caps the
+# revocation latency window.
+AUTHZ_CACHE_TTL_MAX = 60 # seconds
+
+@dataclass
+class Identity:
+ """The gateway-side surface of an authenticated caller.
+
+ Per the IAM contract this is a small fixed shape; regime-internal
+ state (roles, claims, group memberships) is reachable only via
+ the regime's ``authorise`` operation. The gateway itself never
+ reads policy from this object.
+ """
+ # Opaque handle, quoted back when calling ``authorise``. For
+ # the OSS regime this is the user record's id; the gateway
+ # treats it as a string with no semantic content.
+ handle: str
+ # The workspace this credential authenticates to. Used by the
+ # gateway as the default-fill-in for operations that omit a
+ # workspace. Never used as policy input.
+ workspace: str
+ # Stable identifier for audit logs. In OSS this is the same
+ # value as ``handle``; not assumed equal in the contract.
+ principal_id: str
+ # How the credential was presented. Non-policy; useful for
+ # logs / metrics only.
+ source: str # "api-key" | "jwt"
+
+
+def _auth_failure():
+ return web.HTTPUnauthorized(
+ text='{"error":"auth failure"}',
+ content_type="application/json",
+ )
+
+
+def _access_denied():
+ return web.HTTPForbidden(
+ text='{"error":"access denied"}',
+ content_type="application/json",
+ )
+
+
+def _b64url_decode(s):
+ pad = "=" * (-len(s) % 4)
+ return base64.urlsafe_b64decode(s + pad)
+
+
+def _verify_jwt_eddsa(token, public_pem):
+ """Verify an Ed25519 JWT and return its claims. Raises on any
+ validation failure. Refuses non-EdDSA algorithms."""
+ parts = token.split(".")
+ if len(parts) != 3:
+ raise ValueError("malformed JWT")
+ h_b64, p_b64, s_b64 = parts
+ signing_input = f"{h_b64}.{p_b64}".encode("ascii")
+ header = json.loads(_b64url_decode(h_b64))
+ if header.get("alg") != "EdDSA":
+ raise ValueError(f"unsupported alg: {header.get('alg')!r}")
+
+ key = serialization.load_pem_public_key(public_pem.encode("ascii"))
+ if not isinstance(key, ed25519.Ed25519PublicKey):
+ raise ValueError("public key is not Ed25519")
+
+ signature = _b64url_decode(s_b64)
+ key.verify(signature, signing_input) # raises InvalidSignature
+
+ claims = json.loads(_b64url_decode(p_b64))
+ exp = claims.get("exp")
+ if exp is None or exp < time.time():
+ raise ValueError("expired")
+ return claims
+
+
+class IamAuth:
+ """Resolves bearer credentials via the IAM service.
+
+ Used by every gateway endpoint that needs authentication. Fetches
+ the IAM signing public key at startup (cached in memory). API
+ keys are resolved via the IAM service with a local hash→identity
+ cache (short TTL so revoked keys stop working within the TTL
+ window without any push mechanism)."""
+
+ def __init__(self, backend, id="api-gateway"):
+ self.backend = backend
+ self.id = id
+
+ # Populated at start() via IAM.
+ self._signing_public_pem = None
+
+ # API-key cache: plaintext_sha256_hex -> (Identity, expires_ts)
+ self._key_cache = {}
+ self._key_cache_lock = asyncio.Lock()
+
+ # Authorisation decision cache: hash(handle, capability,
+ # resource, parameters) -> (allow_bool, expires_ts). Holds
+ # both allows and denies — denies cached briefly to avoid
+ # hammering iam-svc with repeated rejected attempts.
+ self._authz_cache: dict[str, tuple[bool, float]] = {}
+ self._authz_cache_lock = asyncio.Lock()
+
+ # ------------------------------------------------------------------
+ # Short-lived client helper. Mirrors the pattern used by the
+ # bootstrap framework and AsyncProcessor: a fresh uuid suffix per
+ # invocation so Pulsar exclusive subscriptions don't collide with
+ # ghosts from prior calls.
+ # ------------------------------------------------------------------
+
+ def _make_client(self):
+ rr_id = str(uuid.uuid4())
+ return IamClient(
+ backend=self.backend,
+ subscription=f"{self.id}--iam--{rr_id}",
+ consumer_name=self.id,
+ request_topic=iam_request_queue,
+ request_schema=IamRequest,
+ request_metrics=ProducerMetrics(
+ processor=self.id, flow=None, name="iam-request",
+ ),
+ response_topic=iam_response_queue,
+ response_schema=IamResponse,
+ response_metrics=SubscriberMetrics(
+ processor=self.id, flow=None, name="iam-response",
+ ),
+ )
+
+ async def _with_client(self, op):
+ """Open a short-lived IamClient, run ``op(client)``, close."""
+ client = self._make_client()
+ await client.start()
+ try:
+ return await op(client)
+ finally:
+ try:
+ await client.stop()
+ except Exception:
+ pass
+
+ # ------------------------------------------------------------------
+ # Lifecycle
+ # ------------------------------------------------------------------
+
+ async def start(self, max_retries=30, retry_delay=2.0):
+ """Fetch the signing public key from IAM. Retries on
+ failure — the gateway may be starting before IAM is ready."""
+
+ async def _fetch(client):
+ return await client.get_signing_key_public()
+
+ for attempt in range(max_retries):
+ try:
+ pem = await self._with_client(_fetch)
+ if pem:
+ self._signing_public_pem = pem
+ logger.info(
+ "IamAuth: fetched IAM signing public key "
+ f"({len(pem)} bytes)"
+ )
+ return
+ except Exception as e:
+ logger.info(
+ f"IamAuth: waiting for IAM signing key "
+ f"({type(e).__name__}: {e}); "
+ f"retry {attempt + 1}/{max_retries}"
+ )
+ await asyncio.sleep(retry_delay)
+
+ # Don't prevent startup forever. Nothing in this class re-fetches
+ # the key later; _verify_jwt rejects every JWT while it is unset.
+ logger.warning(
+ "IamAuth: could not fetch IAM signing key at startup; "
+ "JWT validation will fail until it's available"
+ )
+
+ # ------------------------------------------------------------------
+ # Authentication
+ # ------------------------------------------------------------------
+
+ async def authenticate(self, request):
+ """Extract and validate the Bearer credential from an HTTP
+ request. Returns an ``Identity``. Raises HTTPUnauthorized
+ (401 / "auth failure") on any failure mode — the caller
+ cannot distinguish missing / malformed / invalid / expired /
+ revoked credentials."""
+
+ header = request.headers.get("Authorization", "")
+ if not header.startswith("Bearer "):
+ raise _auth_failure()
+ token = header[len("Bearer "):].strip()
+ if not token:
+ raise _auth_failure()
+
+ # API keys always start with "tg_". JWTs have two dots and
+ # no "tg_" prefix. Discriminate cheaply.
+ if token.startswith("tg_"):
+ return await self._resolve_api_key(token)
+ if token.count(".") == 2:
+ return self._verify_jwt(token)
+ raise _auth_failure()
+
+ def _verify_jwt(self, token):
+ if not self._signing_public_pem:
+ raise _auth_failure()
+ try:
+ claims = _verify_jwt_eddsa(token, self._signing_public_pem)
+ except Exception as e:
+ logger.debug(f"JWT validation failed: {type(e).__name__}: {e}")
+ raise _auth_failure()
+
+ sub = claims.get("sub", "")
+ ws = claims.get("workspace", "")
+ if not sub or not ws:
+ raise _auth_failure()
+
+ # JWT carries no policy state under the IAM contract;
+ # any roles / claims field is ignored here.
+ return Identity(
+ handle=sub, workspace=ws, principal_id=sub, source="jwt",
+ )
+
+ async def _resolve_api_key(self, plaintext):
+ h = hashlib.sha256(plaintext.encode("utf-8")).hexdigest()
+
+ cached = self._key_cache.get(h)
+ now = time.time()
+ if cached and cached[1] > now:
+ return cached[0]
+
+ async with self._key_cache_lock:
+ cached = self._key_cache.get(h)
+ if cached and cached[1] > now:
+ return cached[0]
+
+ try:
+ async def _call(client):
+ return await client.resolve_api_key(plaintext)
+ # ``roles`` is returned by the OSS regime as a hint
+ # but is not consulted by the gateway; all policy
+ # decisions go through ``authorise``.
+ user_id, workspace, _roles = await self._with_client(_call)
+ except Exception as e:
+ logger.debug(
+ f"API key resolution failed: "
+ f"{type(e).__name__}: {e}"
+ )
+ raise _auth_failure()
+
+ if not user_id or not workspace:
+ raise _auth_failure()
+
+ identity = Identity(
+ handle=user_id, workspace=workspace,
+ principal_id=user_id, source="api-key",
+ )
+ self._key_cache[h] = (identity, now + API_KEY_CACHE_TTL)
+ return identity
+
+ # ------------------------------------------------------------------
+ # Authorisation
+ # ------------------------------------------------------------------
+
+ @staticmethod
+ def _authz_cache_key(handle, capability, resource, parameters):
+ payload = json.dumps(
+ {
+ "h": handle,
+ "c": capability,
+ "r": resource or {},
+ "p": parameters or {},
+ },
+ sort_keys=True,
+ separators=(",", ":"),
+ )
+ return hashlib.sha256(payload.encode("utf-8")).hexdigest()
+
+ async def authorise(self, identity, capability, resource, parameters):
+ """Ask the IAM regime whether ``identity`` may perform
+ ``capability`` on ``resource`` given ``parameters``.
+
+ Caches the decision for the regime's suggested TTL, clamped
+ above by ``AUTHZ_CACHE_TTL_MAX``. Both allow and deny
+ decisions are cached (denies briefly, to avoid hammering
+ iam-svc with repeated rejected attempts).
+
+ Raises ``HTTPForbidden`` (403 / "access denied") on a deny
+ decision. Raises ``HTTPUnauthorized`` (401 / "auth failure")
+ if the IAM service errors out — failing closed."""
+
+ key = self._authz_cache_key(
+ identity.handle, capability, resource, parameters,
+ )
+ now = time.time()
+
+ cached = self._authz_cache.get(key)
+ if cached and cached[1] > now:
+ allow, _ = cached
+ if not allow:
+ raise _access_denied()
+ return
+
+ async with self._authz_cache_lock:
+ cached = self._authz_cache.get(key)
+ if cached and cached[1] > now:
+ allow, _ = cached
+ if not allow:
+ raise _access_denied()
+ return
+
+ try:
+ async def _call(client):
+ return await client.authorise(
+ identity.handle, capability,
+ resource or {}, parameters or {},
+ )
+ allow, ttl = await self._with_client(_call)
+ except Exception as e:
+ logger.warning(
+ f"authorise failed: {type(e).__name__}: {e}; "
+ f"failing closed for "
+ f"{identity.principal_id!r} cap={capability!r}"
+ )
+ raise _auth_failure()
+
+ ttl = max(0, min(int(ttl or 0), AUTHZ_CACHE_TTL_MAX))
+ self._authz_cache[key] = (bool(allow), now + ttl)
+
+ if not allow:
+ raise _access_denied()
+ return
diff --git a/trustgraph-flow/trustgraph/gateway/capabilities.py b/trustgraph-flow/trustgraph/gateway/capabilities.py
new file mode 100644
index 00000000..72ca51c7
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/capabilities.py
@@ -0,0 +1,100 @@
+"""
+Gateway-side authorisation entry points.
+
+Under the IAM contract (see docs/tech-specs/iam-contract.md) the
+gateway holds *no* policy state. Roles, capability sets, and
+workspace-scope rules all live in the IAM regime (iam-svc for OSS).
+This module is the thin surface the gateway uses to ask the regime
+for a decision:
+
+- ``PUBLIC`` / ``AUTHENTICATED`` sentinels for endpoints that don't
+ go through capability-based authorisation.
+- :func:`enforce` — authenticate, then ask the regime (system-level).
+- :func:`enforce_workspace` — default-fill the workspace from the
+ caller's bound workspace and ask the regime, with the workspace
+ treated as the resource address.
+
+The capability strings themselves are an open vocabulary — see
+docs/tech-specs/capabilities.md. The gateway does not validate them
+beyond passing them through; an unknown capability simply produces a
+deny verdict from the regime.
+"""
+
+from aiohttp import web
+
+
+PUBLIC = "__public__"
+AUTHENTICATED = "__authenticated__"
+
+
+def access_denied():
+ return web.HTTPForbidden(
+ text='{"error":"access denied"}',
+ content_type="application/json",
+ )
+
+
+def auth_failure():
+ return web.HTTPUnauthorized(
+ text='{"error":"auth failure"}',
+ content_type="application/json",
+ )
+
+
+async def enforce(request, auth, capability):
+ """Authenticate the caller and (for non-sentinel capabilities)
+ ask the IAM regime whether they may invoke ``capability``.
+
+ The resource is system-level (``{}``) and parameters are empty —
+ use :func:`enforce_workspace` for workspace-scoped endpoints, or
+ drive authorisation through the operation registry for richer
+ cases.
+
+ - ``PUBLIC``: returns ``None`` — no authentication.
+ - ``AUTHENTICATED``: returns the ``Identity`` — no authorisation.
+ - capability string: returns the ``Identity`` if the regime
+ allows; raises ``HTTPForbidden`` otherwise.
+ """
+ if capability == PUBLIC:
+ return None
+
+ identity = await auth.authenticate(request)
+
+ if capability == AUTHENTICATED:
+ return identity
+
+ await auth.authorise(identity, capability, {}, {})
+ return identity
+
+
+async def enforce_workspace(data, identity, auth, capability=None):
+ """Default-fill the workspace on a request body and (optionally)
+ authorise the caller for ``capability`` against that workspace.
+
+ - Target workspace = ``data["workspace"]`` if supplied, else the
+ caller's bound workspace.
+ - ``data["workspace"]`` is overwritten with the resolved value
+ before any authorisation call (so it is set even on a deny);
+ downstream code sees a single canonical address.
+ - When ``capability`` is given, the regime is asked whether the
+ caller may invoke ``capability`` on ``{workspace: target}``.
+ Raises ``HTTPForbidden`` on a deny.
+
+ For ``capability=None`` no authorisation call is made — the
+ caller has presumably already authorised via :func:`enforce`
+ (handy for endpoints that authorise once then resolve workspace
+ on the body before forwarding).
+ """
+ if not isinstance(data, dict):
+ return data
+
+ requested = data.get("workspace", "")
+ target = requested or identity.workspace
+ data["workspace"] = target
+
+ if capability is not None:
+ await auth.authorise(
+ identity, capability, {"workspace": target}, {},
+ )
+
+ return data
diff --git a/trustgraph-flow/trustgraph/gateway/dispatch/iam.py b/trustgraph-flow/trustgraph/gateway/dispatch/iam.py
new file mode 100644
index 00000000..386233f5
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/dispatch/iam.py
@@ -0,0 +1,40 @@
+
+from ... schema import IamRequest, IamResponse
+from ... schema import iam_request_queue, iam_response_queue
+from ... messaging import TranslatorRegistry
+
+from . requestor import ServiceRequestor
+
+
+class IamRequestor(ServiceRequestor):
+ def __init__(self, backend, consumer, subscriber, timeout=120,
+ request_queue=None, response_queue=None):
+
+ if request_queue is None:
+ request_queue = iam_request_queue
+ if response_queue is None:
+ response_queue = iam_response_queue
+
+ super().__init__(
+ backend=backend,
+ consumer_name=consumer,
+ subscription=subscriber,
+ request_queue=request_queue,
+ response_queue=response_queue,
+ request_schema=IamRequest,
+ response_schema=IamResponse,
+ timeout=timeout,
+ )
+
+ self.request_translator = (
+ TranslatorRegistry.get_request_translator("iam")
+ )
+ self.response_translator = (
+ TranslatorRegistry.get_response_translator("iam")
+ )
+
+ def to_request(self, body):
+ return self.request_translator.decode(body)
+
+ def from_response(self, message):
+ return self.response_translator.encode_with_completion(message)
diff --git a/trustgraph-flow/trustgraph/gateway/dispatch/manager.py b/trustgraph-flow/trustgraph/gateway/dispatch/manager.py
index b238bb5b..ea8770d7 100644
--- a/trustgraph-flow/trustgraph/gateway/dispatch/manager.py
+++ b/trustgraph-flow/trustgraph/gateway/dispatch/manager.py
@@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
from . config import ConfigRequestor
from . flow import FlowRequestor
+from . iam import IamRequestor
from . librarian import LibrarianRequestor
from . knowledge import KnowledgeRequestor
from . collection_management import CollectionManagementRequestor
@@ -72,6 +73,7 @@ request_response_dispatchers = {
global_dispatchers = {
"config": ConfigRequestor,
"flow": FlowRequestor,
+ "iam": IamRequestor,
"librarian": LibrarianRequestor,
"knowledge": KnowledgeRequestor,
"collection-management": CollectionManagementRequestor,
@@ -105,13 +107,31 @@ class DispatcherWrapper:
class DispatcherManager:
- def __init__(self, backend, config_receiver, prefix="api-gateway",
- queue_overrides=None):
+ def __init__(self, backend, config_receiver, auth,
+ prefix="api-gateway", queue_overrides=None):
+ """
+ ``auth`` is required. It flows into the Mux for first-frame
+ WebSocket authentication and into downstream dispatcher
+ construction. There is no permissive default — constructing
+ a DispatcherManager without an authenticator would be a
+ silent downgrade to no-auth on the socket path.
+ """
+ if auth is None:
+ raise ValueError(
+ "DispatcherManager requires an 'auth' argument — there "
+ "is no no-auth mode"
+ )
+
self.backend = backend
self.config_receiver = config_receiver
self.config_receiver.add_handler(self)
self.prefix = prefix
+ # Gateway IamAuth — used by the socket Mux for first-frame
+ # auth and by any dispatcher that needs to resolve caller
+ # identity out-of-band.
+ self.auth = auth
+
# Store queue overrides for global services
# Format: {"config": {"request": "...", "response": "..."}, ...}
self.queue_overrides = queue_overrides or {}
@@ -163,6 +183,15 @@ class DispatcherManager:
def dispatch_global_service(self):
return DispatcherWrapper(self.process_global_service)
+ def dispatch_auth_iam(self):
+ """Pre-configured IAM dispatcher for the gateway's auth
+ endpoints (login, bootstrap, change-password). Pins the
+ kind to ``iam`` so these handlers don't have to supply URL
+ params the global dispatcher would expect."""
+ async def _process(data, responder):
+ return await self.invoke_global_service(data, responder, "iam")
+ return DispatcherWrapper(_process)
+
def dispatch_core_export(self):
return DispatcherWrapper(self.process_core_export)
@@ -314,7 +343,10 @@ class DispatcherManager:
async def process_socket(self, ws, running, params):
- dispatcher = Mux(self, ws, running)
+ # The mux self-authenticates via the first-frame protocol;
+ # pass the gateway's IamAuth so it can validate tokens
+ # without reaching back into the endpoint layer.
+ dispatcher = Mux(self, ws, running, auth=self.auth)
return dispatcher
diff --git a/trustgraph-flow/trustgraph/gateway/dispatch/mux.py b/trustgraph-flow/trustgraph/gateway/dispatch/mux.py
index 3d610dca..03cd748b 100644
--- a/trustgraph-flow/trustgraph/gateway/dispatch/mux.py
+++ b/trustgraph-flow/trustgraph/gateway/dispatch/mux.py
@@ -16,11 +16,28 @@ MAX_QUEUE_SIZE = 10
class Mux:
- def __init__(self, dispatcher_manager, ws, running):
+ def __init__(self, dispatcher_manager, ws, running, auth):
+ """
+ ``auth`` is required — the Mux implements the first-frame
+ auth protocol described in ``iam.md`` and will refuse any
+ non-auth frame until an ``auth-ok`` has been issued. There
+ is no no-auth mode.
+ """
+ if auth is None:
+ raise ValueError(
+ "Mux requires an 'auth' argument — there is no "
+ "no-auth mode"
+ )
self.dispatcher_manager = dispatcher_manager
self.ws = ws
self.running = running
+ self.auth = auth
+
+ # Authenticated identity, populated by the first-frame auth
+ # protocol. ``None`` means the socket is not yet
+ # authenticated; any non-auth frame is refused.
+ self.identity = None
self.q = asyncio.Queue(maxsize=MAX_QUEUE_SIZE)
@@ -31,6 +48,41 @@ class Mux:
if self.ws:
await self.ws.close()
+ async def _handle_auth_frame(self, data):
+ """Process a ``{"type": "auth", "token": "..."}`` frame.
+ On success, updates ``self.identity`` and sends an
+ ``auth-ok`` frame on the socket. On failure, sends the masked
+ ``auth-failed`` frame. Never raises — auth failures keep the
+ socket open so the client can retry without reconnecting
+ (important for browsers, which treat a handshake-time 401
+ as terminal)."""
+ token = data.get("token", "")
+ if not token:
+ await self.ws.send_json({
+ "type": "auth-failed",
+ "error": "auth failure",
+ })
+ return
+
+ class _Shim:
+ def __init__(self, tok):
+ self.headers = {"Authorization": f"Bearer {tok}"}
+
+ try:
+ identity = await self.auth.authenticate(_Shim(token))
+ except Exception:
+ await self.ws.send_json({
+ "type": "auth-failed",
+ "error": "auth failure",
+ })
+ return
+
+ self.identity = identity
+ await self.ws.send_json({
+ "type": "auth-ok",
+ "workspace": identity.workspace,
+ })
+
async def receive(self, msg):
request_id = None
@@ -38,6 +90,16 @@ class Mux:
try:
data = msg.json()
+
+ # In-band auth protocol: the client sends
+ # ``{"type": "auth", "token": "..."}`` as its first frame
+ # (and any time it wants to re-auth: JWT refresh, token
+ # rotation, etc). Auth is always required on a Mux —
+ # there is no no-auth mode.
+ if isinstance(data, dict) and data.get("type") == "auth":
+ await self._handle_auth_frame(data)
+ return
+
request_id = data.get("id")
if "request" not in data:
@@ -46,9 +108,125 @@ class Mux:
if "id" not in data:
raise RuntimeError("Bad message")
+ # Reject all non-auth frames until an ``auth-ok`` has
+ # been issued.
+ if self.identity is None:
+ await self.ws.send_json({
+ "id": request_id,
+ "error": {
+ "message": "auth failure",
+ "type": "auth-required",
+ },
+ "complete": True,
+ })
+ return
+
+ # Per-service capability gating. Resolved through the
+ # operation registry so the WS path matches what HTTP
+ # callers see — same authority, same caps.
+ #
+ # Lookup mirrors the HTTP routing decision in
+ # ``request_task``: presence of ``flow`` on the envelope
+ # means a flow-level data-plane service (graph-rag,
+ # agent, …); absence means a workspace-level service
+ # (config, flow management, librarian, …) whose specific
+ # operation is in the inner request body. ``iam`` is
+ # treated as workspace-level too — its operations are
+ # registered with bare names, no kind prefix.
+ from ..registry import lookup as _registry_lookup
+ from ..capabilities import enforce_workspace
+ from aiohttp import web as _web
+
+ service = data.get("service", "")
+ inner = data.get("request") or {}
+ inner_op = inner.get("operation", "") if isinstance(inner, dict) else ""
+
+ if data.get("flow"):
+ op = _registry_lookup(f"flow-service:{service}")
+ elif service == "iam":
+ op = _registry_lookup(inner_op) if inner_op else None
+ else:
+ op = _registry_lookup(f"{service}:{inner_op}") if inner_op else None
+
+ if op is None:
+ await self.ws.send_json({
+ "id": request_id,
+ "error": {
+ "message": "unknown service",
+ "type": "unknown-service",
+ },
+ "complete": True,
+ })
+ return
+
+ # Resolve workspace first (default-fill from the caller's
+ # bound workspace), then ask the regime to authorise the
+ # service-level capability against the matched
+ # operation's resource shape.
+ try:
+ await enforce_workspace(data, self.identity, self.auth)
+ if isinstance(inner, dict):
+ await enforce_workspace(inner, self.identity, self.auth)
+
+ if data.get("flow"):
+ resource = {
+ "workspace": data.get("workspace", ""),
+ "flow": data.get("flow", ""),
+ }
+ parameters = {}
+ else:
+ # Build a minimal RequestContext so the matched
+ # operation's own extractors decide resource and
+ # parameters — same path the HTTP endpoints take.
+ from ..registry import RequestContext
+ ctx = RequestContext(
+ body=inner if isinstance(inner, dict) else {},
+ match_info={},
+ identity=self.identity,
+ )
+ resource = op.extract_resource(ctx)
+ parameters = op.extract_parameters(ctx)
+
+ await self.auth.authorise(
+ self.identity, op.capability, resource, parameters,
+ )
+ except _web.HTTPForbidden:
+ await self.ws.send_json({
+ "id": request_id,
+ "error": {
+ "message": "access denied",
+ "type": "access-denied",
+ },
+ "complete": True,
+ })
+ return
+ except _web.HTTPUnauthorized:
+ await self.ws.send_json({
+ "id": request_id,
+ "error": {
+ "message": "auth failure",
+ "type": "auth-required",
+ },
+ "complete": True,
+ })
+ return
+
+ workspace = data["workspace"]
+
+ # Plumb authenticated caller's handle as ``actor`` so
+ # iam-svc handlers (whoami, future actor-scoped checks)
+ # know who is calling. Overwrite any caller-supplied
+ # value so it can't be spoofed over the WS.
+ if (
+ service == "iam"
+ and isinstance(data.get("request"), dict)
+ and self.identity is not None
+ ):
+ data["request"]["actor"] = self.identity.handle
+
await self.q.put((
data["id"],
- data.get("workspace", "default"),
+ workspace,
data.get("flow"),
data["service"],
data["request"]
diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/auth_endpoints.py b/trustgraph-flow/trustgraph/gateway/endpoint/auth_endpoints.py
new file mode 100644
index 00000000..44bbc03e
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/endpoint/auth_endpoints.py
@@ -0,0 +1,131 @@
+"""
+Gateway auth endpoints.
+
+Four dedicated paths:
+ POST /api/v1/auth/login — unauthenticated; username/password → JWT
+ POST /api/v1/auth/bootstrap — unauthenticated; IAM bootstrap op
+ POST /api/v1/auth/bootstrap-status — unauthenticated; side-effect-free
+ POST /api/v1/auth/change-password — authenticated; any role
+
+These are the only IAM operations with dedicated paths. Everything
+else routes through ``/api/v1/iam``, gated per operation by the registry.
+"""
+
+import logging
+
+from aiohttp import web
+
+from .. capabilities import enforce, PUBLIC, AUTHENTICATED
+
+logger = logging.getLogger("auth-endpoints")
+logger.setLevel(logging.INFO)
+
+
+class AuthEndpoints:
+ """Groups the four auth-surface handlers. Each forwards to the
+ IAM service via the existing ``IamRequestor`` dispatcher."""
+
+ def __init__(self, iam_dispatcher, auth):
+ self.iam = iam_dispatcher
+ self.auth = auth
+
+ async def start(self):
+ pass
+
+ def add_routes(self, app):
+ app.add_routes([
+ web.post("/api/v1/auth/login", self.login),
+ web.post("/api/v1/auth/bootstrap", self.bootstrap),
+ web.post(
+ "/api/v1/auth/bootstrap-status",
+ self.bootstrap_status,
+ ),
+ web.post(
+ "/api/v1/auth/change-password",
+ self.change_password,
+ ),
+ ])
+
+ async def _forward(self, body):
+ async def responder(x, fin):
+ pass
+ return await self.iam.process(body, responder)
+
+ async def login(self, request):
+ """Public. Accepts {username, password, workspace?}. Returns
+ {jwt, jwt_expires} on success; IAM's masked auth failure on
+ anything else."""
+ await enforce(request, self.auth, PUBLIC)
+ try:
+ body = await request.json()
+ except Exception:
+ return web.json_response(
+ {"error": "invalid json"}, status=400,
+ )
+ req = {
+ "operation": "login",
+ "username": body.get("username", ""),
+ "password": body.get("password", ""),
+ "workspace": body.get("workspace", ""),
+ }
+ resp = await self._forward(req)
+ if "error" in resp:
+ return web.json_response(
+ {"error": "auth failure"}, status=401,
+ )
+ return web.json_response(resp)
+
+ async def bootstrap(self, request):
+ """Public. Valid only when IAM is running in bootstrap mode
+ with empty tables. In every other case the IAM service
+ returns a masked auth-failure."""
+ await enforce(request, self.auth, PUBLIC)
+ resp = await self._forward({"operation": "bootstrap"})
+ if "error" in resp:
+ return web.json_response(
+ {"error": "auth failure"}, status=401,
+ )
+ return web.json_response(resp)
+
+ async def bootstrap_status(self, request):
+ """Public, side-effect-free. Returns ``{"bootstrap_available":
+ bool}`` so a UI can decide whether to render first-run setup
+ without invoking the consuming ``bootstrap`` op."""
+ await enforce(request, self.auth, PUBLIC)
+ resp = await self._forward({"operation": "bootstrap-status"})
+ if "error" in resp:
+ return web.json_response(
+ {"error": "auth failure"}, status=401,
+ )
+ return web.json_response(resp)
+
+ async def change_password(self, request):
+ """Authenticated (any role). Accepts {current_password,
+ new_password}; user_id is taken from the authenticated
+ identity — the caller cannot change someone else's password
+ this way (reset-password is the admin path)."""
+ identity = await enforce(request, self.auth, AUTHENTICATED)
+ try:
+ body = await request.json()
+ except Exception:
+ return web.json_response(
+ {"error": "invalid json"}, status=400,
+ )
+ req = {
+ "operation": "change-password",
+ "user_id": identity.handle,
+ "password": body.get("current_password", ""),
+ "new_password": body.get("new_password", ""),
+ }
+ resp = await self._forward(req)
+ if "error" in resp:
+ err_type = resp.get("error", {}).get("type", "")
+ if err_type == "auth-failed":
+ return web.json_response(
+ {"error": "auth failure"}, status=401,
+ )
+ return web.json_response(
+ {"error": resp.get("error", {}).get("message", "error")},
+ status=400,
+ )
+ return web.json_response(resp)
diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/constant_endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint/constant_endpoint.py
index 58ba1738..920b02ca 100644
--- a/trustgraph-flow/trustgraph/gateway/endpoint/constant_endpoint.py
+++ b/trustgraph-flow/trustgraph/gateway/endpoint/constant_endpoint.py
@@ -1,28 +1,27 @@
-import asyncio
-from aiohttp import web
-import uuid
import logging
+from aiohttp import web
+
+from .. capabilities import enforce, enforce_workspace
+
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
+
class ConstantEndpoint:
- def __init__(self, endpoint_path, auth, dispatcher):
+ def __init__(self, endpoint_path, auth, dispatcher, capability):
self.path = endpoint_path
-
self.auth = auth
- self.operation = "service"
-
+ self.capability = capability
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
-
app.add_routes([
web.post(self.path, self.handle),
])
@@ -31,22 +30,14 @@ class ConstantEndpoint:
logger.debug(f"Processing request: {request.path}")
- try:
- ht = request.headers["Authorization"]
- tokens = ht.split(" ", 2)
- if tokens[0] != "Bearer":
- return web.HTTPUnauthorized()
- token = tokens[1]
- except:
- token = ""
-
- if not self.auth.permitted(token, self.operation):
- return web.HTTPUnauthorized()
+ identity = await enforce(request, self.auth, self.capability)
try:
-
data = await request.json()
+ if identity is not None:
+ await enforce_workspace(data, identity, self.auth)
+
async def responder(x, fin):
pass
@@ -54,10 +45,8 @@ class ConstantEndpoint:
return web.json_response(resp)
+ except web.HTTPException:
+ raise
except Exception as e:
- logging.error(f"Exception: {e}")
-
- return web.json_response(
- { "error": str(e) }
- )
-
+ logger.error(f"Exception: {e}", exc_info=True)
+ return web.json_response({"error": str(e)})
diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/i18n.py b/trustgraph-flow/trustgraph/gateway/endpoint/i18n.py
index b949a499..f28f293d 100644
--- a/trustgraph-flow/trustgraph/gateway/endpoint/i18n.py
+++ b/trustgraph-flow/trustgraph/gateway/endpoint/i18n.py
@@ -4,16 +4,18 @@ from aiohttp import web
from trustgraph.i18n import get_language_pack
+from .. capabilities import enforce
+
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class I18nPackEndpoint:
- def __init__(self, endpoint_path: str, auth):
+ def __init__(self, endpoint_path: str, auth, capability):
self.path = endpoint_path
self.auth = auth
- self.operation = "service"
+ self.capability = capability
async def start(self):
pass
@@ -26,26 +28,13 @@ class I18nPackEndpoint:
async def handle(self, request):
logger.debug(f"Processing i18n pack request: {request.path}")
- token = ""
- try:
- ht = request.headers["Authorization"]
- tokens = ht.split(" ", 2)
- if tokens[0] != "Bearer":
- return web.HTTPUnauthorized()
- token = tokens[1]
- except Exception:
- token = ""
-
- if not self.auth.permitted(token, self.operation):
- return web.HTTPUnauthorized()
+ await enforce(request, self.auth, self.capability)
lang = request.match_info.get("lang") or "en"
- # This is a path traversal defense, and is a critical sec defense.
- # Do not remove!
+ # Path-traversal defense — critical, do not remove.
if "/" in lang or ".." in lang:
return web.HTTPBadRequest(reason="Invalid language code")
pack = get_language_pack(lang)
-
return web.json_response(pack)
diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/iam_endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint/iam_endpoint.py
new file mode 100644
index 00000000..749eacd3
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/endpoint/iam_endpoint.py
@@ -0,0 +1,114 @@
+"""
+Registry-driven /api/v1/iam endpoint.
+
+The gateway no longer gates IAM management with a single coarse
+``users:admin`` capability. Instead, each operation declares its
+own capability + resource shape in the registry (``registry.py``);
+this endpoint reads the body's ``operation`` field, looks up the
+declaration, and asks the IAM regime to authorise the call.
+
+Operations not in the registry produce a 400 ``unknown operation``.
+This is the gateway's primary mechanism for fail-closed gating of
+the IAM surface — the registry is the source of truth.
+"""
+
+import logging
+
+from aiohttp import web
+
+from .. capabilities import (
+ PUBLIC, AUTHENTICATED, auth_failure,
+)
+from .. registry import lookup, RequestContext
+
+logger = logging.getLogger("iam-endpoint")
+logger.setLevel(logging.INFO)
+
+
+class IamEndpoint:
+ """POST /api/v1/iam — generic forwarder gated by the operation
+ registry. The IAM dispatcher (``iam_dispatcher``) forwards the
+ body verbatim to iam-svc once authorisation succeeds."""
+
+ def __init__(self, endpoint_path, auth, dispatcher):
+ self.path = endpoint_path
+ self.auth = auth
+ self.dispatcher = dispatcher
+
+ async def start(self):
+ pass
+
+ def add_routes(self, app):
+ app.add_routes([web.post(self.path, self.handle)])
+
+ async def handle(self, request):
+ try:
+ body = await request.json()
+ except Exception:
+ return web.json_response(
+ {"error": "invalid json"}, status=400,
+ )
+ if not isinstance(body, dict):
+ return web.json_response(
+ {"error": "body must be an object"}, status=400,
+ )
+
+ op_name = body.get("operation", "")
+ op = lookup(op_name)
+ if op is None:
+ return web.json_response(
+ {"error": "unknown operation"}, status=400,
+ )
+
+ # Authentication: required for everything except PUBLIC.
+ identity = None
+ if op.capability != PUBLIC:
+ try:
+ identity = await self.auth.authenticate(request)
+ except web.HTTPException:
+ raise
+
+ # Authorisation: capability sentinels short-circuit the
+ # regime call; capability strings go through authorise().
+ if op.capability not in (PUBLIC, AUTHENTICATED):
+ ctx = RequestContext(
+ body=body,
+ match_info=dict(request.match_info),
+ identity=identity,
+ )
+ try:
+ resource = op.extract_resource(ctx)
+ parameters = op.extract_parameters(ctx)
+ except Exception as e:
+ logger.warning(
+ f"extractor failed for {op_name!r}: "
+ f"{type(e).__name__}: {e}"
+ )
+ return web.json_response(
+ {"error": "bad request"}, status=400,
+ )
+
+ await self.auth.authorise(
+ identity, op.capability, resource, parameters,
+ )
+
+ # Plumb the authenticated caller's handle through as ``actor``
+ # so iam-svc handlers (e.g. whoami, future actor-scoped
+ # checks) know who is making the request. The gateway is
+ # the only authority for this — body-supplied ``actor``
+ # values are overwritten so callers can't impersonate.
+ if identity is not None:
+ body["actor"] = identity.handle
+
+ async def responder(x, fin):
+ pass
+
+ try:
+ resp = await self.dispatcher.process(body, responder)
+ except web.HTTPException:
+ raise
+ except Exception as e:
+ logger.error(f"Exception: {e}", exc_info=True)
+ return web.json_response({"error": str(e)})
+
+ return web.json_response(resp)
diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/manager.py b/trustgraph-flow/trustgraph/gateway/endpoint/manager.py
index fb8b0b76..ed5ef4b5 100644
--- a/trustgraph-flow/trustgraph/gateway/endpoint/manager.py
+++ b/trustgraph-flow/trustgraph/gateway/endpoint/manager.py
@@ -8,72 +8,269 @@ from . variable_endpoint import VariableEndpoint
from . socket import SocketEndpoint
from . metrics import MetricsEndpoint
from . i18n import I18nPackEndpoint
+from . auth_endpoints import AuthEndpoints
+from . iam_endpoint import IamEndpoint
+from . registry_endpoint import RegistryRoutedVariableEndpoint
+
+from .. capabilities import PUBLIC, AUTHENTICATED, auth_failure
+from .. registry import lookup as _registry_lookup, RequestContext
from .. dispatch.manager import DispatcherManager
+
+# /api/v1/{kind} (config / flow / librarian / knowledge /
+# collection-management), /api/v1/iam, and /api/v1/flow/{flow}/...
+# routes are all gated per-operation by the registry, not by a
+# per-kind capability map. Login / bootstrap / change-password are
+# served by AuthEndpoints with their own PUBLIC / AUTHENTICATED
+# sentinels.
+
+
+import logging as _mgr_logging
+_mgr_logger = _mgr_logging.getLogger("endpoint")
+
+
+class _RoutedVariableEndpoint:
+ """HTTP endpoint that gates per request via the operation
+ registry. The URL's ``kind`` parameter combined with a fixed
+ ``registry_prefix`` yields the registry key — e.g. prefix
+ ``flow-service`` and kind ``agent`` looks up
+ ``flow-service:agent``.
+
+ Used for ``/api/v1/flow/{flow}/service/{kind}`` (per-flow
+ data-plane services). ``/api/v1/{kind}`` (workspace-level
+ global services) goes through ``RegistryRoutedVariableEndpoint``
+ which discriminates on body operation as well as URL kind."""
+
+ def __init__(self, endpoint_path, auth, dispatcher, registry_prefix):
+ self.path = endpoint_path
+ self.auth = auth
+ self.dispatcher = dispatcher
+ self._registry_prefix = registry_prefix
+
+ async def start(self):
+ pass
+
+ def add_routes(self, app):
+ app.add_routes([web.post(self.path, self.handle)])
+
+ async def handle(self, request):
+ kind = request.match_info.get("kind", "")
+ op = _registry_lookup(f"{self._registry_prefix}:{kind}")
+ if op is None:
+ return web.json_response(
+ {"error": "unknown kind"}, status=404,
+ )
+
+ identity = await self.auth.authenticate(request)
+
+ try:
+ data = await request.json()
+ ctx = RequestContext(
+ body=data if isinstance(data, dict) else {},
+ match_info=dict(request.match_info),
+ identity=identity,
+ )
+ resource = op.extract_resource(ctx)
+ parameters = op.extract_parameters(ctx)
+ await self.auth.authorise(
+ identity, op.capability, resource, parameters,
+ )
+
+ async def responder(x, fin):
+ pass
+
+ resp = await self.dispatcher.process(
+ data, responder, request.match_info,
+ )
+ return web.json_response(resp)
+
+ except web.HTTPException:
+ raise
+ except Exception as e:
+ _mgr_logger.error(f"Exception: {e}", exc_info=True)
+ return web.json_response({"error": str(e)})
+
+
+class _RoutedSocketEndpoint:
+ """WebSocket endpoint gated per request via the operation
+ registry. Like ``_RoutedVariableEndpoint`` but for the
+ streaming flow import / export socket paths."""
+
+ def __init__(self, endpoint_path, auth, dispatcher, registry_prefix):
+ self.path = endpoint_path
+ self.auth = auth
+ self.dispatcher = dispatcher
+ self._registry_prefix = registry_prefix
+
+ async def start(self):
+ pass
+
+ def add_routes(self, app):
+ app.add_routes([web.get(self.path, self.handle)])
+
+ async def handle(self, request):
+ kind = request.match_info.get("kind", "")
+ op = _registry_lookup(f"{self._registry_prefix}:{kind}")
+ if op is None:
+ return web.json_response(
+ {"error": "unknown kind"}, status=404,
+ )
+
+ token = request.query.get("token", "")
+ if not token:
+ return auth_failure()
+
+ from . socket import _QueryTokenRequest
+ try:
+ identity = await self.auth.authenticate(
+ _QueryTokenRequest(token)
+ )
+ except web.HTTPException as e:
+ return e
+
+ ctx = RequestContext(
+ body={},
+ match_info=dict(request.match_info),
+ identity=identity,
+ )
+ try:
+ resource = op.extract_resource(ctx)
+ parameters = op.extract_parameters(ctx)
+ await self.auth.authorise(
+ identity, op.capability, resource, parameters,
+ )
+ except web.HTTPException as e:
+ return e
+
+ # Delegate the websocket handling to a standalone SocketEndpoint
+ # with the resolved capability, bypassing the per-request mutation
+ # concern by instantiating fresh state.
+ ws_ep = SocketEndpoint(
+ endpoint_path=self.path,
+ auth=self.auth,
+ dispatcher=self.dispatcher,
+ capability=op.capability,
+ )
+ return await ws_ep.handle(request)
+
+
class EndpointManager:
def __init__(
- self, dispatcher_manager, auth, prometheus_url, timeout=600
+ self, dispatcher_manager, auth, prometheus_url, timeout=600,
):
self.dispatcher_manager = dispatcher_manager
self.timeout = timeout
- self.services = {
- }
-
self.endpoints = [
+
+ # Auth surface — public / authenticated-any. Must come
+ # before the generic /api/v1/{kind} routes to win the
+ # match for /api/v1/auth/* paths. aiohttp routes in
+ # registration order, so we prepend here.
+ AuthEndpoints(
+ iam_dispatcher=dispatcher_manager.dispatch_auth_iam(),
+ auth=auth,
+ ),
+
+ # /api/v1/iam — registry-driven IAM management. Per
+ # operation gating happens inside IamEndpoint via the
+ # operation registry; the dispatcher forwards verbatim
+ # to iam-svc once authorisation has succeeded. Listed
+ # before the generic /api/v1/{kind} route so it wins
+ # the match for "iam".
+ IamEndpoint(
+ endpoint_path="/api/v1/iam",
+ auth=auth,
+ dispatcher=dispatcher_manager.dispatch_auth_iam(),
+ ),
+
I18nPackEndpoint(
- endpoint_path = "/api/v1/i18n/packs/{lang}",
- auth = auth,
+ endpoint_path="/api/v1/i18n/packs/{lang}",
+ auth=auth,
+ capability=PUBLIC,
),
MetricsEndpoint(
- endpoint_path = "/api/metrics",
- prometheus_url = prometheus_url,
- auth = auth,
+ endpoint_path="/api/metrics",
+ prometheus_url=prometheus_url,
+ auth=auth,
+ capability="metrics:read",
),
- VariableEndpoint(
- endpoint_path = "/api/v1/{kind}", auth = auth,
- dispatcher = dispatcher_manager.dispatch_global_service(),
+
+ # Global services: registry-driven per-operation gating.
+ # Each kind+op combination has a registry entry that
+ # declares its capability and resource shape. Listed
+ # after the IAM and auth-surface routes; aiohttp's
+ # path matcher prefers the more-specific path so this
+ # variable route doesn't shadow them.
+ RegistryRoutedVariableEndpoint(
+ endpoint_path="/api/v1/{kind}",
+ auth=auth,
+ dispatcher=dispatcher_manager.dispatch_global_service(),
),
+
+ # /api/v1/socket: WebSocket handshake accepts
+ # unconditionally; the Mux dispatcher runs the
+ # first-frame auth protocol. Handshake-time 401s break
+ # browser reconnection, so authentication is always
+ # in-band for this endpoint.
SocketEndpoint(
- endpoint_path = "/api/v1/socket",
- auth = auth,
- dispatcher = dispatcher_manager.dispatch_socket()
+ endpoint_path="/api/v1/socket",
+ auth=auth,
+ dispatcher=dispatcher_manager.dispatch_socket(),
+ capability=AUTHENTICATED, # informational only; bypassed
+ in_band_auth=True,
),
- VariableEndpoint(
- endpoint_path = "/api/v1/flow/{flow}/service/{kind}",
- auth = auth,
- dispatcher = dispatcher_manager.dispatch_flow_service(),
+
+ # Per-flow request/response services — gated per
+ # ``flow-service:<kind>`` registry entry.
+ _RoutedVariableEndpoint(
+ endpoint_path="/api/v1/flow/{flow}/service/{kind}",
+ auth=auth,
+ dispatcher=dispatcher_manager.dispatch_flow_service(),
+ registry_prefix="flow-service",
),
- SocketEndpoint(
- endpoint_path = "/api/v1/flow/{flow}/import/{kind}",
- auth = auth,
- dispatcher = dispatcher_manager.dispatch_flow_import()
+
+ # Per-flow streaming import/export — gated per
+ # ``flow-import:<kind>`` / ``flow-export:<kind>``
+ # registry entry.
+ _RoutedSocketEndpoint(
+ endpoint_path="/api/v1/flow/{flow}/import/{kind}",
+ auth=auth,
+ dispatcher=dispatcher_manager.dispatch_flow_import(),
+ registry_prefix="flow-import",
),
- SocketEndpoint(
- endpoint_path = "/api/v1/flow/{flow}/export/{kind}",
- auth = auth,
- dispatcher = dispatcher_manager.dispatch_flow_export()
+ _RoutedSocketEndpoint(
+ endpoint_path="/api/v1/flow/{flow}/export/{kind}",
+ auth=auth,
+ dispatcher=dispatcher_manager.dispatch_flow_export(),
+ registry_prefix="flow-export",
+ ),
+
+ StreamEndpoint(
+ endpoint_path="/api/v1/import-core",
+ auth=auth,
+ method="POST",
+ dispatcher=dispatcher_manager.dispatch_core_import(),
+ # Cross-subject import — require the admin bundle via a
+ # single representative capability.
+ capability="users:admin",
),
StreamEndpoint(
- endpoint_path = "/api/v1/import-core",
- auth = auth,
- method = "POST",
- dispatcher = dispatcher_manager.dispatch_core_import(),
+ endpoint_path="/api/v1/export-core",
+ auth=auth,
+ method="GET",
+ dispatcher=dispatcher_manager.dispatch_core_export(),
+ capability="users:admin",
),
StreamEndpoint(
- endpoint_path = "/api/v1/export-core",
- auth = auth,
- method = "GET",
- dispatcher = dispatcher_manager.dispatch_core_export(),
- ),
- StreamEndpoint(
- endpoint_path = "/api/v1/document-stream",
- auth = auth,
- method = "GET",
- dispatcher = dispatcher_manager.dispatch_document_stream(),
+ endpoint_path="/api/v1/document-stream",
+ auth=auth,
+ method="GET",
+ dispatcher=dispatcher_manager.dispatch_document_stream(),
+ capability="documents:read",
),
]
@@ -84,4 +281,3 @@ class EndpointManager:
async def start(self):
for ep in self.endpoints:
await ep.start()
-
diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/metrics.py b/trustgraph-flow/trustgraph/gateway/endpoint/metrics.py
index d17d111b..6832d1e3 100644
--- a/trustgraph-flow/trustgraph/gateway/endpoint/metrics.py
+++ b/trustgraph-flow/trustgraph/gateway/endpoint/metrics.py
@@ -10,17 +10,19 @@ import asyncio
import uuid
import logging
+from .. capabilities import enforce
+
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class MetricsEndpoint:
- def __init__(self, prometheus_url, endpoint_path, auth):
+ def __init__(self, prometheus_url, endpoint_path, auth, capability):
self.prometheus_url = prometheus_url
self.path = endpoint_path
self.auth = auth
- self.operation = "service"
+ self.capability = capability
async def start(self):
pass
@@ -35,38 +37,39 @@ class MetricsEndpoint:
logger.debug(f"Processing metrics request: {request.path}")
- try:
- ht = request.headers["Authorization"]
- tokens = ht.split(" ", 2)
- if tokens[0] != "Bearer":
- return web.HTTPUnauthorized()
- token = tokens[1]
- except:
- token = ""
+ await enforce(request, self.auth, self.capability)
- if not self.auth.permitted(token, self.operation):
- return web.HTTPUnauthorized()
+ path = request.match_info["path"]
+ url = (
+ self.prometheus_url + "/api/v1/" + path + "?" +
+ request.query_string
+ )
try:
- path = request.match_info["path"]
-
async with aiohttp.ClientSession() as session:
-
- url = (
- self.prometheus_url + "/api/v1/" + path + "?" +
- request.query_string
- )
-
async with session.get(url) as resp:
return web.Response(
status=resp.status,
text=await resp.text()
)
+ except aiohttp.ClientConnectionError as e:
+
+ # Upstream unreachable (connect refused, DNS failure,
+ # server disconnect). Distinguish from our own errors so
+ # callers know where the fault is.
+ logger.error(f"Metrics upstream {url} unreachable: {e}")
+ return web.Response(
+ status=502,
+ text=f"Bad Gateway: metrics upstream unreachable: {e}",
+ )
+
except Exception as e:
- logging.error(f"Exception: {e}")
-
- raise web.HTTPInternalServerError()
+ logger.error(f"Metrics proxy exception: {e}", exc_info=True)
+ return web.Response(
+ status=500,
+ text=f"Internal Server Error: {e}",
+ )
diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/registry_endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint/registry_endpoint.py
new file mode 100644
index 00000000..296376fa
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/endpoint/registry_endpoint.py
@@ -0,0 +1,123 @@
+"""
+Registry-driven dispatch for ``/api/v1/{kind}`` global services.
+
+The body's ``operation`` field plus the URL's ``{kind}`` together
+form the canonical operation name (``<kind>:<operation>``) that the
+gateway looks up in ``registry.py``. The matched operation
+declares its capability and resource shape; this endpoint asks the
+IAM regime to authorise the call before forwarding the body
+verbatim to the backend dispatcher.
+
+The dispatcher is the same ``dispatch_global_service()`` factory the
+old coarse path used; only the gating layer has changed.
+
+Operations not present in the registry are rejected with 400
+``unknown operation`` — fail closed.
+"""
+
+import logging
+
+from aiohttp import web
+
+from .. capabilities import (
+ PUBLIC, AUTHENTICATED, auth_failure,
+)
+from .. registry import lookup, RequestContext
+
+logger = logging.getLogger("registry-endpoint")
+logger.setLevel(logging.INFO)
+
+
+class RegistryRoutedVariableEndpoint:
+ """POST /api/v1/{kind} — kind comes from the URL, operation comes
+ from the body, both are joined as the registry key."""
+
+ def __init__(self, endpoint_path, auth, dispatcher):
+ self.path = endpoint_path
+ self.auth = auth
+ self.dispatcher = dispatcher
+
+ async def start(self):
+ pass
+
+ def add_routes(self, app):
+ app.add_routes([web.post(self.path, self.handle)])
+
+ async def handle(self, request):
+ kind = request.match_info.get("kind", "")
+ if not kind:
+ return web.json_response(
+ {"error": "missing kind"}, status=404,
+ )
+
+ try:
+ body = await request.json()
+ except Exception:
+ return web.json_response(
+ {"error": "invalid json"}, status=400,
+ )
+ if not isinstance(body, dict):
+ return web.json_response(
+ {"error": "body must be an object"}, status=400,
+ )
+
+ op_name = body.get("operation", "")
+ if not op_name:
+ return web.json_response(
+ {"error": "missing operation"}, status=400,
+ )
+
+ registry_key = f"{kind}:{op_name}"
+ op = lookup(registry_key)
+ if op is None:
+ return web.json_response(
+ {"error": "unknown operation"}, status=400,
+ )
+
+ identity = None
+ if op.capability != PUBLIC:
+ identity = await self.auth.authenticate(request)
+
+ if op.capability not in (PUBLIC, AUTHENTICATED):
+ ctx = RequestContext(
+ body=body,
+ match_info=dict(request.match_info),
+ identity=identity,
+ )
+ try:
+ resource = op.extract_resource(ctx)
+ parameters = op.extract_parameters(ctx)
+ except Exception as e:
+ logger.warning(
+ f"extractor failed for {registry_key!r}: "
+ f"{type(e).__name__}: {e}"
+ )
+ return web.json_response(
+ {"error": "bad request"}, status=400,
+ )
+
+ await self.auth.authorise(
+ identity, op.capability, resource, parameters,
+ )
+
+ # Default-fill workspace into the body so downstream
+ # dispatchers see the canonical resolved value. The
+ # extractor has already pulled the workspace out;
+ # mirror it back to the body for the verbatim forward.
+ if "workspace" in resource:
+ body["workspace"] = resource["workspace"]
+
+ async def responder(x, fin):
+ pass
+
+ try:
+ resp = await self.dispatcher.process(
+ body, responder, request.match_info,
+ )
+ except web.HTTPException:
+ raise
+ except Exception as e:
+ logger.error(f"Exception: {e}", exc_info=True)
+ return web.json_response({"error": str(e)})
+
+ return web.json_response(resp)
diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/socket.py b/trustgraph-flow/trustgraph/gateway/endpoint/socket.py
index 9065761c..f53ad73b 100644
--- a/trustgraph-flow/trustgraph/gateway/endpoint/socket.py
+++ b/trustgraph-flow/trustgraph/gateway/endpoint/socket.py
@@ -4,6 +4,9 @@ from aiohttp import web, WSMsgType
import logging
from .. running import Running
+from .. capabilities import (
+ PUBLIC, AUTHENTICATED, auth_failure,
+)
logger = logging.getLogger("socket")
logger.setLevel(logging.INFO)
@@ -11,12 +14,25 @@ logger.setLevel(logging.INFO)
class SocketEndpoint:
def __init__(
- self, endpoint_path, auth, dispatcher,
+ self, endpoint_path, auth, dispatcher, capability,
+ in_band_auth=False,
):
+ """
+ ``in_band_auth=True`` skips the handshake-time auth check.
+ The WebSocket handshake always succeeds; the dispatcher is
+ expected to gate itself via the first-frame auth protocol
+ (see ``Mux``).
+
+ This avoids the browser problem where a 401 on the handshake
+ is treated as permanent and prevents reconnection, and lets
+ long-lived sockets refresh their credential mid-session by
+ sending a new auth frame.
+ """
self.path = endpoint_path
self.auth = auth
- self.operation = "socket"
+ self.capability = capability
+ self.in_band_auth = in_band_auth
self.dispatcher = dispatcher
@@ -61,15 +77,33 @@ class SocketEndpoint:
raise
async def handle(self, request):
- """Enhanced handler with better cleanup"""
- try:
- token = request.query['token']
- except:
- token = ""
+ """Enhanced handler with better cleanup.
+
+ Auth: WebSocket clients pass the bearer token on the
+ ``?token=...`` query string; we wrap it into a synthetic
+ Authorization header before delegating to the standard auth
+ path so the IAM-backed flow (JWT / API key) applies uniformly.
+ Skipped when ``in_band_auth`` is set; the dispatcher is then
+ expected to gate itself via the first-frame auth protocol."""
+
+ if not self.in_band_auth and self.capability != PUBLIC:
+ token = request.query.get("token", "")
+ if not token:
+ return auth_failure()
+ try:
+ identity = await self.auth.authenticate(
+ _QueryTokenRequest(token)
+ )
+ except web.HTTPException as e:
+ return e
+ if self.capability != AUTHENTICATED:
+ try:
+ await self.auth.authorise(
+ identity, self.capability, {}, {},
+ )
+ except web.HTTPException as e:
+ return e
- if not self.auth.permitted(token, self.operation):
- return web.HTTPUnauthorized()
-
# 50MB max message size
ws = web.WebSocketResponse(max_msg_size=52428800)
@@ -150,3 +184,11 @@ class SocketEndpoint:
web.get(self.path, self.handle),
])
+
+class _QueryTokenRequest:
+ """Minimal shim that exposes headers["Authorization"] to
+ IamAuth.authenticate(), derived from a query-string token."""
+
+ def __init__(self, token):
+ self.headers = {"Authorization": f"Bearer {token}"}
+
diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/stream_endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint/stream_endpoint.py
index 38d8846f..7b0c4692 100644
--- a/trustgraph-flow/trustgraph/gateway/endpoint/stream_endpoint.py
+++ b/trustgraph-flow/trustgraph/gateway/endpoint/stream_endpoint.py
@@ -1,82 +1,64 @@
-import asyncio
-from aiohttp import web
import logging
+from aiohttp import web
+
+from .. capabilities import enforce
+
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
+
class StreamEndpoint:
- def __init__(self, endpoint_path, auth, dispatcher, method="POST"):
-
+ def __init__(
+ self, endpoint_path, auth, dispatcher, capability, method="POST",
+ ):
self.path = endpoint_path
-
self.auth = auth
- self.operation = "service"
+ self.capability = capability
self.method = method
-
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
-
if self.method == "POST":
- app.add_routes([
- web.post(self.path, self.handle),
- ])
+ app.add_routes([web.post(self.path, self.handle)])
elif self.method == "GET":
- app.add_routes([
- web.get(self.path, self.handle),
- ])
+ app.add_routes([web.get(self.path, self.handle)])
else:
- raise RuntimeError("Bad method" + self.method)
+ raise RuntimeError("Bad method " + self.method)
async def handle(self, request):
logger.debug(f"Processing request: {request.path}")
- try:
- ht = request.headers["Authorization"]
- tokens = ht.split(" ", 2)
- if tokens[0] != "Bearer":
- return web.HTTPUnauthorized()
- token = tokens[1]
- except:
- token = ""
-
- if not self.auth.permitted(token, self.operation):
- return web.HTTPUnauthorized()
+ await enforce(request, self.auth, self.capability)
try:
-
data = request.content
async def error(err):
- return web.HTTPInternalServerError(text = err)
+ return web.HTTPInternalServerError(text=err)
async def ok(
- status=200, reason="OK", type="application/octet-stream"
+ status=200, reason="OK",
+ type="application/octet-stream",
):
response = web.StreamResponse(
- status = status, reason = reason,
- headers = {"Content-Type": type}
+ status=status, reason=reason,
+ headers={"Content-Type": type},
)
await response.prepare(request)
return response
- resp = await self.dispatcher.process(
- data, error, ok, request
- )
-
+ resp = await self.dispatcher.process(data, error, ok, request)
return resp
+ except web.HTTPException:
+ raise
except Exception as e:
- logging.error(f"Exception: {e}")
-
- return web.json_response(
- { "error": str(e) }
- )
-
+ logger.error(f"Exception: {e}", exc_info=True)
+ return web.json_response({"error": str(e)})
diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/variable_endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint/variable_endpoint.py
index 608de71b..6a336f42 100644
--- a/trustgraph-flow/trustgraph/gateway/endpoint/variable_endpoint.py
+++ b/trustgraph-flow/trustgraph/gateway/endpoint/variable_endpoint.py
@@ -1,27 +1,27 @@
-import asyncio
-from aiohttp import web
import logging
+from aiohttp import web
+
+from .. capabilities import enforce, enforce_workspace
+
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
+
class VariableEndpoint:
- def __init__(self, endpoint_path, auth, dispatcher):
+ def __init__(self, endpoint_path, auth, dispatcher, capability):
self.path = endpoint_path
-
self.auth = auth
- self.operation = "service"
-
+ self.capability = capability
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
-
app.add_routes([
web.post(self.path, self.handle),
])
@@ -30,35 +30,25 @@ class VariableEndpoint:
logger.debug(f"Processing request: {request.path}")
- try:
- ht = request.headers["Authorization"]
- tokens = ht.split(" ", 2)
- if tokens[0] != "Bearer":
- return web.HTTPUnauthorized()
- token = tokens[1]
- except:
- token = ""
-
- if not self.auth.permitted(token, self.operation):
- return web.HTTPUnauthorized()
+ identity = await enforce(request, self.auth, self.capability)
try:
-
data = await request.json()
+ if identity is not None:
+ await enforce_workspace(data, identity, self.auth)
+
async def responder(x, fin):
pass
resp = await self.dispatcher.process(
- data, responder, request.match_info
+ data, responder, request.match_info,
)
return web.json_response(resp)
+ except web.HTTPException:
+ raise
except Exception as e:
- logging.error(f"Exception: {e}")
-
- return web.json_response(
- { "error": str(e) }
- )
-
+ logger.error(f"Exception: {e}", exc_info=True)
+ return web.json_response({"error": str(e)})
diff --git a/trustgraph-flow/trustgraph/gateway/registry.py b/trustgraph-flow/trustgraph/gateway/registry.py
new file mode 100644
index 00000000..5e3344f4
--- /dev/null
+++ b/trustgraph-flow/trustgraph/gateway/registry.py
@@ -0,0 +1,533 @@
+"""
+Gateway operation registry.
+
+Single declarative table mapping each operation the gateway
+recognises to:
+
+- The capability the IAM regime is asked to authorise against.
+- The resource level (system / workspace / flow) — determines the
+ shape of the resource identifier handed to ``authorise``.
+- Extractors that build the resource and parameters from the
+ request context.
+
+This is a gateway-internal concept. It is not part of the IAM
+contract — the contract specifies what arguments ``authorise``
+receives; the registry is how the gateway populates them.
+
+See docs/tech-specs/iam-contract.md for the contract and
+docs/tech-specs/iam.md for the request anatomy.
+"""
+
+from dataclasses import dataclass, field
+from typing import Any, Callable
+
+
+# Sentinels for operations that don't go through capability-based
+# authorisation. Mirror the values used in capabilities.py so the
+# gateway endpoint layer can recognise them uniformly.
+PUBLIC = "__public__"
+AUTHENTICATED = "__authenticated__"
+
+
+class ResourceLevel:
+ """Where the operation's resource lives.
+
+ ``SYSTEM`` — operation acts on a deployment-level resource
+ (the user registry, the workspace registry,
+ the signing key). resource = {}. Workspace,
+ if relevant, is a parameter, not an address.
+
+ ``WORKSPACE`` — operation acts on something within a workspace
+ (config, library, knowledge, collections, flow
+ lifecycle). resource = {workspace}.
+
+ ``FLOW`` — operation acts on something within a flow
+ within a workspace (graph, agent, llm, etc.).
+ resource = {workspace, flow}.
+ """
+ SYSTEM = "system"
+ WORKSPACE = "workspace"
+ FLOW = "flow"
+
+
+@dataclass
+class RequestContext:
+ """The bundle of inputs the registry's extractors operate on.
+ Assembled by the gateway from the incoming request after
+ authentication."""
+
+ # Parsed JSON body (HTTP) or inner request payload (WebSocket).
+ body: dict = field(default_factory=dict)
+
+ # URL path components (HTTP) or WebSocket envelope routing
+ # fields (id, service, workspace, flow).
+ match_info: dict = field(default_factory=dict)
+
+ # Authenticated identity for default-fill-in. Always present
+ # by the time extractors run, except for PUBLIC operations
+ # where it is None.
+ identity: Any = None
+
+
+@dataclass
+class Operation:
+ """Declared operation the gateway can dispatch + authorise."""
+
+ # Canonical operation name (used for registry lookup, audit,
+ # debug logs). Mirrors the operation strings in the IAM
+ # service and other backends where applicable.
+ name: str
+
+ # Capability required to invoke this operation. Either a
+ # string from the capability vocabulary in capabilities.md, or
+ # the PUBLIC / AUTHENTICATED sentinel for operations that
+ # don't go through capability-based authorisation.
+ capability: str
+
+ # Where the operation's resource lives. Determines the
+ # shape of the resource argument passed to authorise.
+ resource_level: str
+
+ # Build the resource identifier from the request context.
+ # Returns a dict with the appropriate components for the
+ # resource level: {} for SYSTEM, {workspace} for WORKSPACE,
+ # {workspace, flow} for FLOW. Default-fill-in of workspace
+ # from identity.workspace happens here when applicable.
+ extract_resource: Callable[[RequestContext], dict]
+
+ # Build the parameters dict — decision-relevant fields the
+ # operation supplied that are not part of the resource
+ # address. E.g. workspace association on a system-level
+ # user-registry operation.
+ extract_parameters: Callable[[RequestContext], dict]
+
+
+# ---------------------------------------------------------------------------
+# Registry storage.
+# ---------------------------------------------------------------------------
+
+
+_REGISTRY: dict[str, Operation] = {}
+
+
+def register(op: Operation) -> None:
+ if op.name in _REGISTRY:
+ raise RuntimeError(
+ f"operation {op.name!r} already registered"
+ )
+ _REGISTRY[op.name] = op
+
+
+def lookup(name: str) -> Operation | None:
+ return _REGISTRY.get(name)
+
+
+def all_operations() -> list[Operation]:
+ return list(_REGISTRY.values())
+
+
+# ---------------------------------------------------------------------------
+# Common extractor helpers.
+# ---------------------------------------------------------------------------
+
+
+def _empty_resource(_ctx: RequestContext) -> dict:
+ """System-level resource: empty dict."""
+ return {}
+
+
+def _workspace_from_body(ctx: RequestContext) -> dict:
+    """Workspace-level resource from the body's ``workspace`` field,
+    else the caller's bound workspace, else ``""`` (never ``None``)."""
+    ws = (ctx.body.get("workspace") if isinstance(ctx.body, dict) else "")
+    if not ws and ctx.identity is not None:
+        ws = ctx.identity.workspace
+    return {"workspace": ws or ""}  # match sibling extractors: "" not None
+
+
+def _flow_from_match_info(ctx: RequestContext) -> dict:
+ """Flow-level resource sourced from URL path components or WS
+ envelope fields. Both ``workspace`` and ``flow`` are required;
+ no default-fill-in (the address is the operation's identity)."""
+ return {
+ "workspace": ctx.match_info.get("workspace", ""),
+ "flow": ctx.match_info.get("flow", ""),
+ }
+
+
+def _no_parameters(_ctx: RequestContext) -> dict:
+ return {}
+
+
+def _body_as_parameters(ctx: RequestContext) -> dict:
+ """All body fields are parameters — used when the operation's
+ body is small and uniformly decision-relevant (e.g. user-
+ registry ops where the body's user.workspace is what the
+ regime checks against the admin's scope)."""
+ return dict(ctx.body) if isinstance(ctx.body, dict) else {}
+
+
+def _workspace_param_only(ctx: RequestContext) -> dict:
+ """Parameters dict carrying only the workspace association.
+ Used by system-level operations (e.g. user-registry ops) where
+ the workspace isn't part of the resource address but is the
+ field the regime uses to scope the admin's authority.
+
+ Pulls the workspace from the inner ``user`` / ``workspace_record``
+ body field if present (create-user, create-workspace), then from
+ the top-level body, then from the caller's bound workspace."""
+ body = ctx.body if isinstance(ctx.body, dict) else {}
+ inner_user = body.get("user") if isinstance(body.get("user"), dict) else {}
+ inner_ws = (
+ body.get("workspace_record")
+ if isinstance(body.get("workspace_record"), dict) else {}
+ )
+ ws = (
+ inner_user.get("workspace")
+ or inner_ws.get("id")
+ or body.get("workspace")
+ )
+ if not ws and ctx.identity is not None:
+ ws = ctx.identity.workspace
+ return {"workspace": ws or ""}
+
+
+# ---------------------------------------------------------------------------
+# Operation registrations.
+#
+# The gateway looks operations up by their canonical name (the same
+# string the request body / WS envelope carries in its ``operation``
+# field where applicable). Auth-surface operations (login, bootstrap,
+# bootstrap-status, change-password, whoami) are not in this section
+# — they are registered under "Auth-surface entries" below with the
+# PUBLIC / AUTHENTICATED sentinels and routed by auth_endpoints.py.
+# Pure gateway↔IAM internal operations (resolve-api-key, authorise,
+# authorise-many, get-signing-key-public) are excluded entirely;
+# they are never invoked over the public API.
+
+
+# IAM management operations. All routed through /api/v1/iam, body
+# carries ``operation`` plus operation-specific fields.
+
+# User registry: SYSTEM-level resource (users are global, identified
+# by handle). The admin's authority is scoped per workspace via the
+# parameters {workspace} field — that's what the regime checks
+# against the admin's role workspace_scope.
+register(Operation(
+ name="create-user",
+ capability="users:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_workspace_param_only,
+))
+register(Operation(
+ name="list-users",
+ capability="users:read",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_workspace_param_only,
+))
+register(Operation(
+ name="get-user",
+ capability="users:read",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_workspace_param_only,
+))
+register(Operation(
+ name="update-user",
+ capability="users:write",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_workspace_param_only,
+))
+register(Operation(
+ name="disable-user",
+ capability="users:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_workspace_param_only,
+))
+register(Operation(
+ name="enable-user",
+ capability="users:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_workspace_param_only,
+))
+register(Operation(
+ name="delete-user",
+ capability="users:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_workspace_param_only,
+))
+register(Operation(
+ name="reset-password",
+ capability="users:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_workspace_param_only,
+))
+
+
+# API keys: SYSTEM-level resource — like users, a key record exists
+# in the deployment-wide keys registry. The workspace the key
+# authenticates to is a property of the record, not a containment;
+# it appears as a parameter so the regime can scope the admin's
+# authority to issue / list / revoke against it.
+register(Operation(
+ name="create-api-key",
+ capability="keys:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_workspace_param_only,
+))
+register(Operation(
+ name="list-api-keys",
+ capability="keys:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_workspace_param_only,
+))
+register(Operation(
+ name="revoke-api-key",
+ capability="keys:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_workspace_param_only,
+))
+
+
+# Workspace registry: SYSTEM-level resource (workspaces are the
+# top-level addressable unit). No parameters — the workspace being
+# acted on is identified by the body, not used as a scope cue.
+register(Operation(
+ name="create-workspace",
+ capability="workspaces:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_no_parameters,
+))
+register(Operation(
+ name="list-workspaces",
+ capability="workspaces:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_no_parameters,
+))
+register(Operation(
+ name="get-workspace",
+ capability="workspaces:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_no_parameters,
+))
+register(Operation(
+ name="update-workspace",
+ capability="workspaces:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_no_parameters,
+))
+register(Operation(
+ name="disable-workspace",
+ capability="workspaces:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_no_parameters,
+))
+
+
+# Signing key: SYSTEM-level operational op.
+register(Operation(
+ name="rotate-signing-key",
+ capability="iam:admin",
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_no_parameters,
+))
+
+
+# ---------------------------------------------------------------------------
+# Auth-surface entries.
+#
+# Listed here so the registry is the one place the gateway looks for
+# operation→capability mappings — including the sentinels for paths
+# that don't go through capability-based authorisation. The actual
+# routing is in auth_endpoints.py; these entries let the registry-
+# driven dispatcher recognise the operation if it sees it on a
+# generic path.
+# ---------------------------------------------------------------------------
+
+register(Operation(
+ name="login",
+ capability=PUBLIC,
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_no_parameters,
+))
+register(Operation(
+ name="bootstrap",
+ capability=PUBLIC,
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_no_parameters,
+))
+register(Operation(
+ name="bootstrap-status",
+ capability=PUBLIC,
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_no_parameters,
+))
+register(Operation(
+ name="change-password",
+ capability=AUTHENTICATED,
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_no_parameters,
+))
+register(Operation(
+ name="whoami",
+ capability=AUTHENTICATED,
+ resource_level=ResourceLevel.SYSTEM,
+ extract_resource=_empty_resource,
+ extract_parameters=_no_parameters,
+))
+
+
+# ---------------------------------------------------------------------------
+# Generic kind/operation entries.
+#
+# Names are ``<kind>:<operation>`` so the registry key is unique
+# across dispatchers. All entries below are workspace-level
+# resources (workspace defaulted from the caller's bound workspace
+# if absent). Read/write distinction maps to the existing
+# ``<subject>:read`` / ``<subject>:write`` capability vocabulary
+# defined in capabilities.md.
+# ---------------------------------------------------------------------------
+
+
+def _register_kind_op(kind: str, op: str, capability: str) -> None:
+ """Helper: register a workspace-level kind:op with the standard
+ extractors (workspace from body, no extra parameters)."""
+ register(Operation(
+ name=f"{kind}:{op}",
+ capability=capability,
+ resource_level=ResourceLevel.WORKSPACE,
+ extract_resource=_workspace_from_body,
+ extract_parameters=_no_parameters,
+ ))
+
+
+# config: KV-style workspace config service.
+for _op in ("get", "list", "getvalues", "getvalues-all-ws", "config"):
+ _register_kind_op("config", _op, "config:read")
+for _op in ("put", "delete"):
+ _register_kind_op("config", _op, "config:write")
+
+
+# flow: flow-blueprint and flow-lifecycle service.
+for _op in ("list-blueprints", "get-blueprint", "list-flows", "get-flow"):
+ _register_kind_op("flow", _op, "flows:read")
+for _op in ("put-blueprint", "delete-blueprint", "start-flow", "stop-flow"):
+ _register_kind_op("flow", _op, "flows:write")
+
+
+# librarian: document storage and processing service.
+for _op in (
+ "get-document-metadata", "get-document-content",
+ "stream-document", "list-documents", "list-processing",
+ "get-upload-status", "list-uploads",
+):
+ _register_kind_op("librarian", _op, "documents:read")
+for _op in (
+ "add-document", "remove-document", "update-document",
+ "add-processing", "remove-processing",
+ "begin-upload", "upload-chunk", "complete-upload", "abort-upload",
+):
+ _register_kind_op("librarian", _op, "documents:write")
+
+
+# knowledge: knowledge-graph core service.
+for _op in ("get-kg-core", "list-kg-cores"):
+ _register_kind_op("knowledge", _op, "knowledge:read")
+for _op in ("put-kg-core", "delete-kg-core",
+ "load-kg-core", "unload-kg-core"):
+ _register_kind_op("knowledge", _op, "knowledge:write")
+
+
+# collection-management: workspace collection lifecycle.
+_register_kind_op("collection-management", "list-collections", "collections:read")
+for _op in ("update-collection", "delete-collection"):
+ _register_kind_op("collection-management", _op, "collections:write")
+
+
+# ---------------------------------------------------------------------------
+# Per-flow data-plane services.
+#
+# /api/v1/flow/{flow}/service/{kind} and the streaming
+# /api/v1/flow/{flow}/{import,export}/{kind} paths. No body-level
+# ``operation`` discriminator — the URL kind is the operation
+# identity. Resource is FLOW level (workspace + flow).
+#
+# Names: ``flow-service:<kind>``, ``flow-import:<kind>``,
+# ``flow-export:<kind>``.
+# ---------------------------------------------------------------------------
+
+
+def _register_flow_kind(prefix: str, kind: str, capability: str) -> None:
+ register(Operation(
+ name=f"{prefix}:{kind}",
+ capability=capability,
+ resource_level=ResourceLevel.FLOW,
+ extract_resource=_flow_from_match_info,
+ extract_parameters=_no_parameters,
+ ))
+
+
+# Request/response services on /api/v1/flow/{flow}/service/{kind}.
+_FLOW_SERVICES = {
+ "agent": "agent",
+ "text-completion": "llm",
+ "prompt": "llm",
+ "mcp-tool": "mcp",
+ "graph-rag": "graph:read",
+ "document-rag": "documents:read",
+ "embeddings": "embeddings",
+ "graph-embeddings": "graph:read",
+ "document-embeddings": "documents:read",
+ "triples": "graph:read",
+ "rows": "rows:read",
+ "nlp-query": "rows:read",
+ "structured-query": "rows:read",
+ "structured-diag": "rows:read",
+ "row-embeddings": "rows:read",
+ "sparql": "graph:read",
+}
+for _kind, _cap in _FLOW_SERVICES.items():
+ _register_flow_kind("flow-service", _kind, _cap)
+
+
+# Streaming import socket endpoints.
+_FLOW_IMPORTS = {
+ "triples": "graph:write",
+ "graph-embeddings": "graph:write",
+ "document-embeddings": "documents:write",
+ "entity-contexts": "documents:write",
+ "rows": "rows:write",
+}
+for _kind, _cap in _FLOW_IMPORTS.items():
+ _register_flow_kind("flow-import", _kind, _cap)
+
+
+# Streaming export socket endpoints.
+_FLOW_EXPORTS = {
+ "triples": "graph:read",
+ "graph-embeddings": "graph:read",
+ "document-embeddings": "documents:read",
+ "entity-contexts": "documents:read",
+}
+for _kind, _cap in _FLOW_EXPORTS.items():
+ _register_flow_kind("flow-export", _kind, _cap)
diff --git a/trustgraph-flow/trustgraph/gateway/service.py b/trustgraph-flow/trustgraph/gateway/service.py
index 4e465bf7..f75f3b25 100755
--- a/trustgraph-flow/trustgraph/gateway/service.py
+++ b/trustgraph-flow/trustgraph/gateway/service.py
@@ -12,7 +12,7 @@ import os
from trustgraph.base.logging import setup_logging, add_logging_args
from trustgraph.base.pubsub import get_pubsub, add_pubsub_args
-from . auth import Authenticator
+from . auth import IamAuth
from . config.receiver import ConfigReceiver
from . dispatch.manager import DispatcherManager
@@ -35,7 +35,6 @@ default_prometheus_url = os.getenv("PROMETHEUS_URL", "http://prometheus:9090")
default_pulsar_api_key = os.getenv("PULSAR_API_KEY", None)
default_timeout = 600
default_port = 8088
-default_api_token = os.getenv("GATEWAY_SECRET", "")
class Api:
@@ -60,13 +59,14 @@ class Api:
if not self.prometheus_url.endswith("/"):
self.prometheus_url += "/"
- api_token = config.get("api_token", default_api_token)
-
- # Token not set, or token equal empty string means no auth
- if api_token:
- self.auth = Authenticator(token=api_token)
- else:
- self.auth = Authenticator(allow_all=True)
+ # IAM-backed authentication. The legacy GATEWAY_SECRET
+ # shared-token path has been removed — there is no
+ # "open for everyone" fallback. The gateway cannot
+ # authenticate any request until IAM is reachable.
+ self.auth = IamAuth(
+ backend=self.pubsub_backend,
+ id=config.get("id", "api-gateway"),
+ )
self.config_receiver = ConfigReceiver(self.pubsub_backend)
@@ -118,6 +118,7 @@ class Api:
config_receiver = self.config_receiver,
prefix = "gateway",
queue_overrides = queue_overrides,
+ auth = self.auth,
)
self.endpoint_manager = EndpointManager(
@@ -132,12 +133,18 @@ class Api:
]
async def app_factory(self):
-
+
self.app = web.Application(
middlewares=[],
client_max_size=256 * 1024 * 1024
)
+ # Fetch IAM signing public key before accepting traffic.
+ # Blocks for a bounded retry window; the gateway starts even
+ # if IAM is still unreachable (JWT validation will 401 until
+ # the key is available).
+ await self.auth.start()
+
await self.config_receiver.start()
for ep in self.endpoints:
@@ -189,12 +196,6 @@ def run():
help=f'API request timeout in seconds (default: {default_timeout})',
)
- parser.add_argument(
- '--api-token',
- default=default_api_token,
- help=f'Secret API token (default: no auth)',
- )
-
add_logging_args(parser)
parser.add_argument(
diff --git a/trustgraph-flow/trustgraph/iam/__init__.py b/trustgraph-flow/trustgraph/iam/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/trustgraph-flow/trustgraph/iam/service/__init__.py b/trustgraph-flow/trustgraph/iam/service/__init__.py
new file mode 100644
index 00000000..98f4d9da
--- /dev/null
+++ b/trustgraph-flow/trustgraph/iam/service/__init__.py
@@ -0,0 +1 @@
+from . service import *
diff --git a/trustgraph-flow/trustgraph/iam/service/__main__.py b/trustgraph-flow/trustgraph/iam/service/__main__.py
new file mode 100644
index 00000000..a731dd63
--- /dev/null
+++ b/trustgraph-flow/trustgraph/iam/service/__main__.py
@@ -0,0 +1,4 @@
+
+from . service import run
+
+run()
diff --git a/trustgraph-flow/trustgraph/iam/service/iam.py b/trustgraph-flow/trustgraph/iam/service/iam.py
new file mode 100644
index 00000000..c89f65b0
--- /dev/null
+++ b/trustgraph-flow/trustgraph/iam/service/iam.py
@@ -0,0 +1,1358 @@
+"""
+IAM business logic. Handles ``IamRequest`` messages and builds
+``IamResponse`` messages. Does not concern itself with transport.
+
+See docs/tech-specs/iam-protocol.md for the wire-level contract and
+docs/tech-specs/iam.md for the surrounding architecture.
+"""
+
+import asyncio
+import base64
+import datetime
+import hashlib
+import json
+import logging
+import os
+import secrets
+import uuid
+
+from cryptography.hazmat.primitives import serialization
+from cryptography.hazmat.primitives.asymmetric import ed25519
+
+from trustgraph.schema import (
+ IamResponse, Error,
+ UserRecord, WorkspaceRecord, ApiKeyRecord,
+)
+
+from ... tables.iam import IamTableStore
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_WORKSPACE = "default"
+BOOTSTRAP_ADMIN_USERNAME = "admin"
+BOOTSTRAP_ADMIN_NAME = "Administrator"
+
+PBKDF2_ITERATIONS = 600_000
+API_KEY_PREFIX = "tg_"
+API_KEY_RANDOM_BYTES = 24
+
+JWT_ISSUER = "trustgraph-iam"
+JWT_TTL_SECONDS = 3600
+
+# Default authorisation cache TTL the regime tells the gateway to
+# observe. 60s is the OSS-spec maximum revocation latency: a role
+# change, workspace disable, or key revoke takes effect within at
+# most this much time.
+AUTHZ_CACHE_TTL_SECONDS = 60
+
+
+# OSS regime role table. Lives here, not in the gateway — the
+# gateway is regime-agnostic and must not encode policy.
+#
+# Each role has a capability set and a workspace scope. The
+# evaluator (handle_authorise below) checks (a) that some role
+# held by the caller grants the requested capability, and (b)
+# that role's workspace scope permits the target workspace.
+
+_READER_CAPS = {
+ "agent",
+ "graph:read",
+ "documents:read",
+ "rows:read",
+ "llm",
+ "embeddings",
+ "mcp",
+ "config:read",
+ "flows:read",
+ "collections:read",
+ "knowledge:read",
+ "keys:self",
+}
+
+_WRITER_CAPS = _READER_CAPS | {
+ "graph:write",
+ "documents:write",
+ "rows:write",
+ "collections:write",
+ "knowledge:write",
+}
+
+_ADMIN_CAPS = _WRITER_CAPS | {
+ "config:write",
+ "flows:write",
+ "users:read", "users:write", "users:admin",
+ "keys:admin",
+ "workspaces:admin",
+ "iam:admin",
+ "metrics:read",
+}
+
+ROLE_DEFINITIONS = {
+ "reader": {
+ "capabilities": _READER_CAPS,
+ "workspace_scope": "assigned",
+ },
+ "writer": {
+ "capabilities": _WRITER_CAPS,
+ "workspace_scope": "assigned",
+ },
+ "admin": {
+ "capabilities": _ADMIN_CAPS,
+ "workspace_scope": "*",
+ },
+}
+
+
+def _scope_permits(role_scope, target_workspace, assigned_workspace):
+ """Does the given role apply to ``target_workspace``?"""
+ if role_scope == "*":
+ return True
+ if role_scope == "assigned":
+ return target_workspace == assigned_workspace
+ return False
+
+
+def _now_iso():
+ return datetime.datetime.now(datetime.timezone.utc).isoformat()
+
+
+def _now_dt():
+ return datetime.datetime.now(datetime.timezone.utc)
+
+
+def _iso(dt):
+ if dt is None:
+ return ""
+ if isinstance(dt, str):
+ return dt
+ if dt.tzinfo is None:
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
+ return dt.isoformat()
+
+
+def _hash_password(password):
+    """Return an encoded PBKDF2-SHA-256 hash of ``password``.
+
+    Format: ``pbkdf2-sha256$<iterations>$<b64-salt>$<b64-hash>``. Stored
+    verbatim in the password_hash column so the algorithm and cost
+    can be evolved later (new rows get a new prefix; old rows are
+    verified with their own parameters).
+    """
+    salt = os.urandom(16)
+    dk = hashlib.pbkdf2_hmac(
+        "sha256", password.encode("utf-8"), salt, PBKDF2_ITERATIONS,
+    )
+    return (
+        f"pbkdf2-sha256${PBKDF2_ITERATIONS}"
+        f"${base64.b64encode(salt).decode('ascii')}"
+        f"${base64.b64encode(dk).decode('ascii')}"
+    )
+
+
+def _verify_password(password, encoded):
+    """Constant-time verify; returns False for any malformed input."""
+    try:
+        algo, iters, b64_salt, b64_hash = encoded.split("$")
+    except (AttributeError, ValueError):  # None / non-str / wrong field count
+        return False
+    if algo != "pbkdf2-sha256":
+        return False
+    try:
+        salt = base64.b64decode(b64_salt)
+        target = base64.b64decode(b64_hash)
+        # Derive inside the guard so bad iteration counts / inputs mean "no match".
+        dk = hashlib.pbkdf2_hmac(
+            "sha256", password.encode("utf-8"), salt, int(iters),
+        )
+    except Exception:
+        return False
+    return secrets.compare_digest(dk, target)
+
+
+def _generate_api_key():
+    """Return a fresh API-key plaintext of the form ``tg_<random>``."""
+    return API_KEY_PREFIX + secrets.token_urlsafe(API_KEY_RANDOM_BYTES)
+
+
+def _hash_api_key(plaintext):
+ """SHA-256 hex digest of an API key plaintext. Used as the
+ primary key in ``iam_api_keys`` so ``resolve-api-key`` is O(1)."""
+ return hashlib.sha256(plaintext.encode("utf-8")).hexdigest()
+
+
+def _err(type, message):
+ return IamResponse(error=Error(type=type, message=message))
+
+
+def _parse_expires(s):
+ if not s:
+ return None
+ try:
+ return datetime.datetime.fromisoformat(s)
+ except Exception:
+ return None
+
+
+def _b64url(data):
+ """URL-safe base64 encode without padding, as required by JWT."""
+ return base64.urlsafe_b64encode(data).rstrip(b"=").decode("ascii")
+
+
+def _generate_signing_keypair():
+ """Return (kid, private_pem, public_pem) for a fresh Ed25519
+ keypair. Ed25519 / EdDSA: small (32-byte public key), fast,
+ deterministic, side-channel-resistant by construction, free of
+ NIST-curve baggage."""
+ key = ed25519.Ed25519PrivateKey.generate()
+ private_pem = key.private_bytes(
+ encoding=serialization.Encoding.PEM,
+ format=serialization.PrivateFormat.PKCS8,
+ encryption_algorithm=serialization.NoEncryption(),
+ ).decode("ascii")
+ public_pem = key.public_key().public_bytes(
+ encoding=serialization.Encoding.PEM,
+ format=serialization.PublicFormat.SubjectPublicKeyInfo,
+ ).decode("ascii")
+ kid = uuid.uuid4().hex[:16]
+ return kid, private_pem, public_pem
+
+
+def _sign_jwt(kid, private_pem, claims):
+ """Produce a compact-serialisation EdDSA (Ed25519) JWT for
+ ``claims``."""
+ key = serialization.load_pem_private_key(
+ private_pem.encode("ascii"), password=None,
+ )
+ if not isinstance(key, ed25519.Ed25519PrivateKey):
+ raise RuntimeError(
+ f"signing key is not Ed25519: {type(key).__name__}"
+ )
+
+ header = {"alg": "EdDSA", "typ": "JWT", "kid": kid}
+ header_b = _b64url(json.dumps(
+ header, separators=(",", ":"), sort_keys=True,
+ ).encode("utf-8"))
+ payload_b = _b64url(json.dumps(
+ claims, separators=(",", ":"), sort_keys=True,
+ ).encode("utf-8"))
+ signing_input = f"{header_b}.{payload_b}".encode("ascii")
+ signature = key.sign(signing_input)
+
+ return f"{header_b}.{payload_b}.{_b64url(signature)}"
+
+
+class IamService:
+
+ def __init__(self, host, username, password, keyspace,
+ bootstrap_mode, bootstrap_token=None):
+ self.table_store = IamTableStore(
+ host, username, password, keyspace,
+ )
+ # bootstrap_mode: "token" or "bootstrap". In "token" mode the
+ # service auto-seeds on first start using the provided
+ # bootstrap_token and the ``bootstrap`` operation is refused
+ # thereafter (indistinguishable from an already-bootstrapped
+ # deployment per the error policy). In "bootstrap" mode the
+ # ``bootstrap`` operation is live until tables are populated.
+ if bootstrap_mode not in ("token", "bootstrap"):
+ raise ValueError(
+ f"bootstrap_mode must be 'token' or 'bootstrap', "
+ f"got {bootstrap_mode!r}"
+ )
+ if bootstrap_mode == "token" and not bootstrap_token:
+ raise ValueError(
+ "bootstrap_mode='token' requires bootstrap_token"
+ )
+ self.bootstrap_mode = bootstrap_mode
+ self.bootstrap_token = bootstrap_token
+
+ self._signing_key = None
+ self._signing_key_lock = asyncio.Lock()
+
+ # ------------------------------------------------------------------
+ # Dispatch
+ # ------------------------------------------------------------------
+
+    async def handle(self, v):
+        """Route an IAM request to the handler for ``v.operation``.
+
+        Returns whatever the selected handler returns.  An operation
+        with no registered handler produces an ``invalid-argument``
+        error response.  Any ``Exception`` raised while dispatching or
+        inside the handler is logged with its traceback and turned
+        into an ``internal-error`` response instead of propagating."""
+        op = v.operation
+
+        # Operation name -> name of the method implementing it.  The
+        # method is only looked up on ``self`` for the operation that
+        # was actually requested.
+        handler_names = {
+            "bootstrap": "handle_bootstrap",
+            "bootstrap-status": "handle_bootstrap_status",
+            "whoami": "handle_whoami",
+            "resolve-api-key": "handle_resolve_api_key",
+            "create-user": "handle_create_user",
+            "list-users": "handle_list_users",
+            "create-api-key": "handle_create_api_key",
+            "list-api-keys": "handle_list_api_keys",
+            "revoke-api-key": "handle_revoke_api_key",
+            "login": "handle_login",
+            "get-signing-key-public": "handle_get_signing_key_public",
+            "change-password": "handle_change_password",
+            "reset-password": "handle_reset_password",
+            "get-user": "handle_get_user",
+            "update-user": "handle_update_user",
+            "disable-user": "handle_disable_user",
+            "enable-user": "handle_enable_user",
+            "delete-user": "handle_delete_user",
+            "create-workspace": "handle_create_workspace",
+            "list-workspaces": "handle_list_workspaces",
+            "get-workspace": "handle_get_workspace",
+            "update-workspace": "handle_update_workspace",
+            "disable-workspace": "handle_disable_workspace",
+            "rotate-signing-key": "handle_rotate_signing_key",
+            "authorise": "handle_authorise",
+            "authorise-many": "handle_authorise_many",
+        }
+
+        try:
+            # Non-string operations can never match a handler; treat
+            # them like any other unknown operation.
+            method_name = (
+                handler_names.get(op) if isinstance(op, str) else None
+            )
+            if method_name is None:
+                return _err(
+                    "invalid-argument",
+                    f"unknown or not-yet-implemented operation: {op!r}",
+                )
+            return await getattr(self, method_name)(v)
+
+        except Exception as e:
+            logger.error(
+                f"IAM {op} failed: {type(e).__name__}: {e}",
+                exc_info=True,
+            )
+            return _err("internal-error", str(e))
+
+ # ------------------------------------------------------------------
+ # Record conversion
+ # ------------------------------------------------------------------
+
+    def _row_to_user_record(self, row):
+        """Build a ``UserRecord`` from a ten-column user row.
+
+        The row is unpacked positionally as (id, workspace, username,
+        name, email, password_hash, roles, enabled,
+        must_change_password, created).  The password hash is dropped
+        and never copied into the record.  Falsy string columns become
+        ``""``, roles come out as a sorted list (empty when unset),
+        both flags are coerced to ``bool`` and ``created`` is passed
+        through ``_iso``."""
+        (
+            user_id, home_workspace, username, display_name, email,
+            _password_hash, role_set, is_enabled, needs_pw_change,
+            created_at,
+        ) = row
+
+        if role_set:
+            role_list = sorted(role_set)
+        else:
+            role_list = []
+
+        return UserRecord(
+            id=user_id or "",
+            workspace=home_workspace or "",
+            username=username or "",
+            name=display_name or "",
+            email=email or "",
+            roles=role_list,
+            enabled=bool(is_enabled),
+            must_change_password=bool(needs_pw_change),
+            created=_iso(created_at),
+        )
+
+    def _row_to_api_key_record(self, row):
+        """Build an ``ApiKeyRecord`` from an eight-column API key row.
+
+        The row is unpacked positionally as (key_hash, id, user_id,
+        name, prefix, expires, created, last_used).  The key hash is
+        dropped.  Falsy string columns become ``""`` and the three
+        timestamps are passed through ``_iso``."""
+        (
+            _key_hash, key_id, owner_id, label, shown_prefix,
+            expires_at, created_at, last_used_at,
+        ) = row
+
+        return ApiKeyRecord(
+            id=key_id or "",
+            user_id=owner_id or "",
+            name=label or "",
+            prefix=shown_prefix or "",
+            expires=_iso(expires_at),
+            created=_iso(created_at),
+            last_used=_iso(last_used_at),
+        )
+
+ # ------------------------------------------------------------------
+ # bootstrap
+ # ------------------------------------------------------------------
+
+    async def auto_bootstrap_if_token_mode(self):
+        """Seed an empty deployment at startup when in ``token`` mode.
+
+        Called from the service processor at startup.  When
+        ``bootstrap_mode`` is ``token`` and no workspace exists yet,
+        the default workspace / admin / signing key are seeded and the
+        operator-provided ``bootstrap_token`` becomes the admin's API
+        key plaintext.  The operator already knows that value, so
+        nothing is returned or logged.
+
+        In any other mode this does nothing; in ``bootstrap`` mode
+        seeding happens on an explicit ``bootstrap`` operation."""
+        if self.bootstrap_mode == "token":
+            already_seeded = await self.table_store.any_workspace_exists()
+
+            if already_seeded:
+                logger.info(
+                    "IAM: token mode, tables already populated; skipping "
+                    "auto-bootstrap"
+                )
+            else:
+                logger.info(
+                    "IAM: token mode, empty tables; auto-bootstrapping"
+                )
+                await self._seed_tables(self.bootstrap_token)
+                logger.info(
+                    "IAM: auto-bootstrap complete using operator-provided "
+                    "token"
+                )
+
+    async def _seed_tables(self, api_key_plaintext):
+        """Write the initial IAM records for an empty deployment.
+
+        Shared by token-mode auto-bootstrap and the explicit
+        ``bootstrap`` operation.  In order, this stores the default
+        workspace, an enabled ``admin``-role user, an API key for that
+        user whose hash is derived from ``api_key_plaintext``, and a
+        freshly generated signing key, which is also placed in the
+        in-memory signing-key cache.
+
+        The admin's password is a random value that is hashed and
+        then discarded; the account is flagged
+        ``must_change_password``.
+
+        Returns the admin user id."""
+        seeded_at = _now_dt()
+
+        # Default workspace.
+        await self.table_store.put_workspace(
+            id=DEFAULT_WORKSPACE, name="Default",
+            enabled=True, created=seeded_at,
+        )
+
+        # Admin user.  The plaintext password is never stored or
+        # returned, only its hash.
+        admin_user_id = str(uuid.uuid4())
+        discarded_password = secrets.token_urlsafe(32)
+        await self.table_store.put_user(
+            id=admin_user_id,
+            workspace=DEFAULT_WORKSPACE,
+            username=BOOTSTRAP_ADMIN_USERNAME,
+            name=BOOTSTRAP_ADMIN_NAME,
+            email="",
+            password_hash=_hash_password(discarded_password),
+            roles=["admin"],
+            enabled=True,
+            must_change_password=True,
+            created=seeded_at,
+        )
+
+        # Admin API key.  Only the hash and a short display prefix of
+        # the plaintext are persisted.
+        api_key_id = str(uuid.uuid4())
+        await self.table_store.put_api_key(
+            key_hash=_hash_api_key(api_key_plaintext),
+            id=api_key_id,
+            user_id=admin_user_id,
+            name="bootstrap",
+            prefix=api_key_plaintext[:len(API_KEY_PREFIX) + 4],
+            expires=None,
+            created=seeded_at,
+            last_used=None,
+        )
+
+        # Initial signing key, persisted and then cached in memory.
+        signing_key = _generate_signing_keypair()
+        kid, private_pem, public_pem = signing_key
+        await self.table_store.put_signing_key(
+            kid=kid, private_pem=private_pem, public_pem=public_pem,
+            created=seeded_at, retired=None,
+        )
+        self._signing_key = (kid, private_pem, public_pem)
+
+        logger.info(
+            f"IAM seeded: workspace={DEFAULT_WORKSPACE!r}, "
+            f"admin user_id={admin_user_id}, signing key kid={kid}"
+        )
+        return admin_user_id
+
+    async def handle_bootstrap(self, v):
+        """Explicit ``bootstrap`` operation.
+
+        Succeeds only in ``bootstrap`` mode while no workspace exists.
+        Every refusal is reported as the same generic ``auth-failed``
+        error, so a caller cannot tell 'not in bootstrap mode' apart
+        from 'already bootstrapped'.
+
+        On success a new API key is generated, the tables are seeded
+        with it, and the response carries the admin user id together
+        with the key plaintext."""
+        # The storage probe only runs when the mode check passes.
+        refused = (
+            self.bootstrap_mode != "bootstrap"
+            or await self.table_store.any_workspace_exists()
+        )
+        if refused:
+            return _err("auth-failed", "auth failure")
+
+        new_key = _generate_api_key()
+        admin_user_id = await self._seed_tables(new_key)
+
+        return IamResponse(
+            bootstrap_admin_user_id=admin_user_id,
+            bootstrap_admin_api_key=new_key,
+        )
+
+    async def handle_whoami(self, v):
+        """Return the user record of the caller itself.
+
+        ``v.actor`` is the authenticated identity's handle (the
+        gateway populates it from ``identity.handle``).  No
+        ``users:read`` capability is required: every authenticated
+        user may read their own record.
+
+        Responds ``invalid-argument`` when ``v.actor`` is empty and
+        ``not-found`` when no user row exists for it."""
+        actor = v.actor
+        if not actor:
+            return _err(
+                "invalid-argument",
+                "actor required (gateway should populate this)",
+            )
+
+        own_row = await self.table_store.get_user(actor)
+        if own_row is not None:
+            return IamResponse(user=self._row_to_user_record(own_row))
+
+        return _err("not-found", "user not found")
+
+    async def handle_bootstrap_status(self, v):
+        """Report whether an explicit ``bootstrap`` call would succeed.
+
+        ``bootstrap_available`` is true only in ``bootstrap`` mode
+        while no workspace exists.  The op is PUBLIC so a UI can
+        decide whether to render the first-run setup flow without
+        invoking the side-effectful ``bootstrap`` op.
+
+        The information leaked is intentionally narrow: an empty
+        deployment in bootstrap mode is already inferable (no users,
+        no logins succeed); this makes the answer explicit instead of
+        forcing callers to probe the masked-failure path."""
+        if self.bootstrap_mode != "bootstrap":
+            # Wrong mode: no need to touch storage.
+            available = False
+        else:
+            available = not await self.table_store.any_workspace_exists()
+
+        return IamResponse(bootstrap_available=available)
+
+ # ------------------------------------------------------------------
+ # Signing key helpers
+ # ------------------------------------------------------------------
+
+    async def _get_active_signing_key(self):
+        """Return ``(kid, private_pem, public_pem)`` of the active key.
+
+        The cached key is returned when present.  Otherwise, under
+        ``_signing_key_lock``, the cache is checked again and then the
+        stored signing keys are listed; the first row whose column 4
+        (retired) is ``None`` is cached and returned.  When no such
+        row exists a new keypair is generated, persisted and cached.
+        That last case covers ``login`` being called before
+        ``bootstrap``, which shouldn't happen in practice but keeps
+        the service internally consistent."""
+        cached = self._signing_key
+        if cached is not None:
+            return cached
+
+        async with self._signing_key_lock:
+            # Another task may have filled the cache while this one
+            # was waiting for the lock.
+            if self._signing_key is not None:
+                return self._signing_key
+
+            stored_rows = await self.table_store.list_signing_keys()
+            unretired = [
+                key_row for key_row in stored_rows if key_row[4] is None
+            ]
+
+            if not unretired:
+                kid, private_pem, public_pem = _generate_signing_keypair()
+                await self.table_store.put_signing_key(
+                    kid=kid,
+                    private_pem=private_pem,
+                    public_pem=public_pem,
+                    created=_now_dt(),
+                    retired=None,
+                )
+                self._signing_key = (kid, private_pem, public_pem)
+                logger.info(
+                    f"IAM: generated active signing key kid={kid} "
+                    f"(no existing key found)"
+                )
+                return self._signing_key
+
+            chosen = unretired[0]
+            self._signing_key = (chosen[0], chosen[1], chosen[2])
+            logger.info(
+                f"IAM: loaded active signing key kid={chosen[0]}"
+            )
+            return self._signing_key
+
+ # ------------------------------------------------------------------
+ # login
+ # ------------------------------------------------------------------
+
+    async def handle_login(self, v):
+        """Authenticate a username / password pair and issue a JWT.
+
+        The user is looked up by username inside ``v.workspace``, or
+        the default workspace when none is given.  Login is refused
+        with an ``auth-failed`` error when a credential field is
+        missing, the user is unknown or disabled, the password does
+        not verify, or the user's workspace is missing or disabled.
+
+        On success the response carries a JWT signed with the active
+        signing key and its expiry as an ISO-8601 UTC timestamp.  The
+        token is valid for ``JWT_TTL_SECONDS``.
+
+        NOTE(review): each refusal has its own message ("no such
+        user", "bad credentials", ...).  Confirm the gateway masks
+        these before they reach an unauthenticated caller."""
+        if not v.username:
+            return _err("auth-failed", "username required")
+        if not v.password:
+            return _err("auth-failed", "password required")
+
+        # Login accepts an optional workspace parameter.  If omitted
+        # the default workspace is used (OSS single-workspace
+        # assumption).  Multi-workspace enterprise editions swap in a
+        # resolver that looks across the caller's permitted set.
+        lookup_workspace = v.workspace or DEFAULT_WORKSPACE
+
+        found_id = await self.table_store.get_user_id_by_username(
+            lookup_workspace, v.username,
+        )
+        if not found_id:
+            return _err("auth-failed", "no such user")
+
+        account = await self.table_store.get_user(found_id)
+        if account is None:
+            return _err("auth-failed", "user disappeared")
+
+        (
+            subject_id, home_workspace, _username, _name, _email,
+            stored_hash, _roles, is_enabled, _must_change, _created,
+        ) = account
+
+        if not is_enabled:
+            return _err("auth-failed", "user disabled")
+
+        password_ok = bool(stored_hash) and _verify_password(
+            v.password, stored_hash,
+        )
+        if not password_ok:
+            return _err("auth-failed", "bad credentials")
+
+        # Column 2 of the workspace row is its enabled flag.
+        workspace_row = await self.table_store.get_workspace(home_workspace)
+        if not (workspace_row is not None and workspace_row[2]):
+            return _err("auth-failed", "workspace disabled")
+
+        kid, private_pem, _ = await self._get_active_signing_key()
+
+        issued_at = int(_now_dt().timestamp())
+        expires_at = issued_at + JWT_TTL_SECONDS
+
+        # Per the IAM contract the gateway never reads policy state
+        # from the credential: roles stay server-side, reachable only
+        # via authorise().  The JWT carries identity + workspace
+        # binding only.
+        token = _sign_jwt(kid, private_pem, {
+            "iss": JWT_ISSUER,
+            "sub": subject_id,
+            "workspace": home_workspace,
+            "iat": issued_at,
+            "exp": expires_at,
+        })
+
+        expiry = datetime.datetime.fromtimestamp(
+            expires_at, tz=datetime.timezone.utc,
+        )
+        return IamResponse(jwt=token, jwt_expires=expiry.isoformat())
+
+ # ------------------------------------------------------------------
+ # get-signing-key-public
+ # ------------------------------------------------------------------
+
+    async def handle_get_signing_key_public(self, v):
+        """Return the public PEM of the active signing key."""
+        active_key = await self._get_active_signing_key()
+        _kid, _private_pem, public_pem = active_key
+        return IamResponse(signing_key_public=public_pem)
+
+ # ------------------------------------------------------------------
+ # Record-conversion helper for workspaces
+ # ------------------------------------------------------------------
+
+    def _row_to_workspace_record(self, row):
+        """Build a ``WorkspaceRecord`` from a four-column workspace
+        row (id, name, enabled, created).  Falsy strings become
+        ``""``, ``enabled`` is coerced to ``bool`` and ``created`` is
+        passed through ``_iso``."""
+        workspace_id, display_name, is_enabled, created_at = row
+        return WorkspaceRecord(
+            id=workspace_id or "",
+            name=display_name or "",
+            enabled=bool(is_enabled),
+            created=_iso(created_at),
+        )
+
+    async def _resolve_user(self, user_id, workspace=None):
+        """Load a user row by id, optionally checking its workspace.
+
+        Returns ``(user_row, None)`` on success and
+        ``(None, error_response)`` otherwise: ``not-found`` when no
+        row exists, ``operation-not-permitted`` when a truthy
+        ``workspace`` is supplied and differs from the row's home
+        workspace (column 1).
+
+        Workspace is an *optional integrity check*: the user record is
+        system-level, identified by id alone.  Authorisation (whether
+        the caller may operate on this user) is the gateway's
+        responsibility via the contract's ``authorise`` call before
+        the handler is reached.
+        """
+        row = await self.table_store.get_user(user_id)
+
+        if row is None:
+            return None, _err("not-found", "user not found")
+
+        if workspace:
+            home_workspace = row[1]
+            if home_workspace != workspace:
+                return None, _err(
+                    "operation-not-permitted",
+                    "user is in a different workspace",
+                )
+
+        return row, None
+
+ # ------------------------------------------------------------------
+ # change-password
+ # ------------------------------------------------------------------
+
+    async def handle_change_password(self, v):
+        """Replace a user's password after verifying the current one.
+
+        Requires ``v.user_id``, ``v.password`` (the current password)
+        and ``v.new_password``; a missing field is an
+        ``invalid-argument`` error.  An unknown or disabled user, or a
+        current password that does not verify, is an ``auth-failed``
+        error.  On success the new hash is stored and
+        ``must_change_password`` is cleared."""
+        if not v.user_id:
+            return _err("invalid-argument", "user_id required")
+        if not v.password:
+            return _err("invalid-argument", "password (current) required")
+        if not v.new_password:
+            return _err("invalid-argument", "new_password required")
+
+        account = await self.table_store.get_user(v.user_id)
+        if account is None:
+            return _err("auth-failed", "no such user")
+
+        (
+            _id, _workspace, _username, _name, _email,
+            stored_hash, _roles, is_enabled, _must_change, _created,
+        ) = account
+
+        if not is_enabled:
+            return _err("auth-failed", "user disabled")
+
+        current_ok = bool(stored_hash) and _verify_password(
+            v.password, stored_hash,
+        )
+        if not current_ok:
+            return _err("auth-failed", "bad credentials")
+
+        replacement_hash = _hash_password(v.new_password)
+        await self.table_store.update_user_password(
+            id=v.user_id,
+            password_hash=replacement_hash,
+            must_change_password=False,
+        )
+        return IamResponse()
+
+ # ------------------------------------------------------------------
+ # reset-password
+ # ------------------------------------------------------------------
+
+    async def handle_reset_password(self, v):
+        """Give a user a random temporary password.
+
+        Requires ``v.user_id``; ``v.workspace`` is an optional
+        integrity check on the user's home workspace.  The hash of
+        the temporary password is stored with ``must_change_password``
+        set, and the plaintext is returned in the response."""
+        if not v.user_id:
+            return _err("invalid-argument", "user_id required")
+
+        resolved = await self._resolve_user(v.user_id, v.workspace or None)
+        failure = resolved[1]
+        if failure is not None:
+            return failure
+
+        temp_password = secrets.token_urlsafe(12)
+        await self.table_store.update_user_password(
+            id=v.user_id,
+            password_hash=_hash_password(temp_password),
+            must_change_password=True,
+        )
+        return IamResponse(temporary_password=temp_password)
+
+ # ------------------------------------------------------------------
+ # get-user / update-user / disable-user
+ # ------------------------------------------------------------------
+
+    async def handle_get_user(self, v):
+        """Return the user record for ``v.user_id``.  ``v.workspace``
+        is an optional integrity check on the user's home workspace."""
+        if not v.user_id:
+            return _err("invalid-argument", "user_id required")
+
+        row, failure = await self._resolve_user(
+            v.user_id, v.workspace or None,
+        )
+        if failure is None:
+            return IamResponse(user=self._row_to_user_record(row))
+        return failure
+
+    async def handle_update_user(self, v):
+        """Update user profile fields: name, email, roles, enabled,
+        must_change_password.
+
+        Username is immutable: change it by creating a new user and
+        disabling the old one.  Password changes go through
+        change-password / reset-password; supplying either a password
+        or a different username here is an ``invalid-argument``
+        error.
+
+        Fields the caller leaves unset keep their stored value: falsy
+        name / email / roles and ``None`` flags are carried forward.
+        The response holds the user record re-read after the update."""
+        if not v.user_id:
+            return _err("invalid-argument", "user_id required")
+
+        changes = v.user
+        if changes is None:
+            return _err("invalid-argument", "user field required")
+        if changes.password:
+            return _err(
+                "invalid-argument",
+                "password cannot be changed via update-user; "
+                "use change-password or reset-password",
+            )
+
+        stored, failure = await self._resolve_user(
+            v.user_id, v.workspace or None,
+        )
+        if failure is not None:
+            return failure
+
+        # Column 2 of the stored row is the username.
+        if changes.username and changes.username != stored[2]:
+            return _err(
+                "invalid-argument",
+                "username is immutable; create a new user instead",
+            )
+
+        (
+            _id, _workspace, _username, stored_name, stored_email,
+            _password_hash, stored_roles, stored_enabled,
+            stored_must_change, _created,
+        ) = stored
+
+        # Merge: the caller's value wins only when it was provided.
+        merged_name = changes.name or stored_name
+        merged_email = changes.email or stored_email
+        merged_roles = list(changes.roles or stored_roles or [])
+
+        merged_enabled = stored_enabled
+        if changes.enabled is not None:
+            merged_enabled = changes.enabled
+
+        merged_must_change = stored_must_change
+        if changes.must_change_password is not None:
+            merged_must_change = changes.must_change_password
+
+        await self.table_store.update_user_profile(
+            id=v.user_id,
+            name=merged_name,
+            email=merged_email,
+            roles=merged_roles,
+            enabled=merged_enabled,
+            must_change_password=merged_must_change,
+        )
+
+        refreshed = await self.table_store.get_user(v.user_id)
+        return IamResponse(user=self._row_to_user_record(refreshed))
+
+    async def handle_disable_user(self, v):
+        """Soft-delete a user: set enabled=false, then delete every
+        API key belonging to the user.  ``v.workspace`` is an
+        optional integrity check on the user's home workspace."""
+        target_id = v.user_id
+        if not target_id:
+            return _err("invalid-argument", "user_id required")
+
+        _row, failure = await self._resolve_user(
+            target_id, v.workspace or None,
+        )
+        if failure is not None:
+            return failure
+
+        await self.table_store.update_user_enabled(
+            id=target_id, enabled=False,
+        )
+
+        # Column 0 of an API key row is the key hash, which is what
+        # delete_api_key is called with.
+        for key_row in await self.table_store.list_api_keys_by_user(
+            target_id,
+        ):
+            await self.table_store.delete_api_key(key_row[0])
+
+        return IamResponse()
+
+    async def handle_enable_user(self, v):
+        """Re-enable a previously disabled user.  API keys are not
+        restored; those have to be re-issued by the admin.
+        ``v.workspace`` is an optional integrity check on the user's
+        home workspace."""
+        target_id = v.user_id
+        if not target_id:
+            return _err("invalid-argument", "user_id required")
+
+        _row, failure = await self._resolve_user(
+            target_id, v.workspace or None,
+        )
+        if failure is not None:
+            return failure
+
+        await self.table_store.update_user_enabled(
+            id=target_id, enabled=True,
+        )
+        return IamResponse()
+
+    async def handle_delete_user(self, v):
+        """Hard-delete a user.
+
+        Removes, in this order, every API key belonging to the user,
+        the ``iam_users_by_username`` lookup row, and the
+        ``iam_users`` row.
+
+        Unlike disable, this frees the username for re-use and
+        removes the user's personal data from storage (intended to
+        cover GDPR erasure-style requirements).  When audit logging
+        lands, the decision to delete vs. anonymise referenced audit
+        rows will need to be revisited."""
+        target_id = v.user_id
+        if not target_id:
+            return _err("invalid-argument", "user_id required")
+
+        row, failure = await self._resolve_user(
+            target_id, v.workspace or None,
+        )
+        if failure is not None:
+            return failure
+
+        # Row columns follow get_user: [1] workspace, [2] username.
+        home_workspace, username = row[1], row[2]
+
+        # Revoke all API keys (column 0 of a key row is its hash).
+        for key_row in await self.table_store.list_api_keys_by_user(
+            target_id,
+        ):
+            await self.table_store.delete_api_key(key_row[0])
+
+        # The username lookup is keyed on (workspace, username), so
+        # the workspace stored on the user record is used rather than
+        # the caller-supplied filter.
+        if username:
+            await self.table_store.delete_username_lookup(
+                home_workspace, username,
+            )
+
+        # Finally the user record itself.
+        await self.table_store.delete_user(target_id)
+
+        return IamResponse()
+
+ # ------------------------------------------------------------------
+ # Workspace CRUD
+ # ------------------------------------------------------------------
+
+    async def handle_create_workspace(self, v):
+        """Create a workspace from ``v.workspace_record``.
+
+        The record needs an id that does not start with ``_`` (those
+        ids are reserved) and that is not already taken
+        (``duplicate`` error).  The name defaults to the id.  The
+        response holds the workspace record re-read after the write."""
+        spec = v.workspace_record
+        if spec is None or not spec.id:
+            return _err(
+                "invalid-argument",
+                "workspace_record.id required for create-workspace",
+            )
+        if spec.id.startswith("_"):
+            return _err(
+                "invalid-argument",
+                "workspace ids beginning with '_' are reserved",
+            )
+
+        if await self.table_store.get_workspace(spec.id) is not None:
+            return _err("duplicate", "workspace already exists")
+
+        created_at = _now_dt()
+        await self.table_store.put_workspace(
+            id=spec.id,
+            name=spec.name or spec.id,
+            enabled=spec.enabled,
+            created=created_at,
+        )
+
+        stored = await self.table_store.get_workspace(spec.id)
+        return IamResponse(workspace=self._row_to_workspace_record(stored))
+
+    async def handle_list_workspaces(self, v):
+        """Return every workspace in the store as a workspace record."""
+        stored_rows = await self.table_store.list_workspaces()
+        records = [
+            self._row_to_workspace_record(ws_row) for ws_row in stored_rows
+        ]
+        return IamResponse(workspaces=records)
+
+    async def handle_get_workspace(self, v):
+        """Return the workspace named by ``v.workspace_record.id``,
+        or a ``not-found`` error when it does not exist."""
+        wanted = v.workspace_record
+        if wanted is None or not wanted.id:
+            return _err("invalid-argument", "workspace_record.id required")
+
+        stored = await self.table_store.get_workspace(wanted.id)
+        if stored is not None:
+            return IamResponse(
+                workspace=self._row_to_workspace_record(stored),
+            )
+        return _err("not-found", "workspace not found")
+
+    async def handle_update_workspace(self, v):
+        """Update workspace name / enabled.  The id is immutable.
+
+        A falsy name or a ``None`` enabled flag in
+        ``v.workspace_record`` keeps the stored value.  The response
+        holds the workspace record re-read after the update."""
+        changes = v.workspace_record
+        if changes is None or not changes.id:
+            return _err("invalid-argument", "workspace_record.id required")
+
+        stored = await self.table_store.get_workspace(changes.id)
+        if stored is None:
+            return _err("not-found", "workspace not found")
+
+        _id, stored_name, stored_enabled, _created = stored
+
+        merged_name = changes.name or stored_name
+        merged_enabled = stored_enabled
+        if changes.enabled is not None:
+            merged_enabled = changes.enabled
+
+        await self.table_store.update_workspace(
+            id=changes.id, name=merged_name, enabled=merged_enabled,
+        )
+
+        refreshed = await self.table_store.get_workspace(changes.id)
+        return IamResponse(
+            workspace=self._row_to_workspace_record(refreshed),
+        )
+
+    async def handle_disable_workspace(self, v):
+        """Set enabled=false on the workspace, disable every user in
+        it, and delete every API key belonging to those users."""
+        target = v.workspace_record
+        if target is None or not target.id:
+            return _err("invalid-argument", "workspace_record.id required")
+
+        stored = await self.table_store.get_workspace(target.id)
+        if stored is None:
+            return _err("not-found", "workspace not found")
+
+        # update_workspace takes the name too; keep the stored one
+        # (column 1), falling back to the id when it is empty.
+        await self.table_store.update_workspace(
+            id=target.id,
+            name=stored[1] or target.id,
+            enabled=False,
+        )
+
+        members = await self.table_store.list_users_by_workspace(target.id)
+        for member_row in members:
+            member_id = member_row[0]
+            await self.table_store.update_user_enabled(
+                id=member_id, enabled=False,
+            )
+            member_keys = await self.table_store.list_api_keys_by_user(
+                member_id,
+            )
+            for key_row in member_keys:
+                # Column 0 of a key row is the key hash.
+                await self.table_store.delete_api_key(key_row[0])
+
+        return IamResponse()
+
+ # ------------------------------------------------------------------
+ # rotate-signing-key
+ # ------------------------------------------------------------------
+
+    async def handle_rotate_signing_key(self, v):
+        """Create a new signing key, retire the current active key,
+        and switch the in-memory cache over.
+
+        The retired key row is kept in ``iam_signing_keys`` so the
+        gateway's JWT validator can continue to validate previously
+        issued tokens during the grace period.  Actual grace-period
+        enforcement (time-window acceptance at the validator) lands
+        with the gateway auth middleware work."""
+        previous = await self._get_active_signing_key()
+        rotated_at = _now_dt()
+
+        # Retire the currently-active key, if any.
+        retired_kid = None
+        if previous is not None:
+            retired_kid, _old_private, _old_public = previous
+            await self.table_store.retire_signing_key(
+                kid=retired_kid, retired=rotated_at,
+            )
+
+        fresh_key = _generate_signing_keypair()
+        new_kid, new_private, new_public = fresh_key
+        await self.table_store.put_signing_key(
+            kid=new_kid,
+            private_pem=new_private,
+            public_pem=new_public,
+            created=rotated_at,
+            retired=None,
+        )
+        self._signing_key = (new_kid, new_private, new_public)
+
+        logger.info(
+            f"IAM: rotated signing key. "
+            f"New kid={new_kid}, retired kid={retired_kid}"
+        )
+        return IamResponse()
+
+ # ------------------------------------------------------------------
+ # resolve-api-key
+ # ------------------------------------------------------------------
+
+    async def handle_resolve_api_key(self, v):
+        """Resolve an API key plaintext to its owning identity.
+
+        The key is looked up by its hash.  Resolution fails with an
+        ``auth-failed`` error when the key is missing, unknown or
+        expired, when the owning user is missing or disabled, or when
+        the owner's workspace is missing or disabled.  On success the
+        response carries the owner's id, workspace and roles."""
+        if not v.api_key:
+            return _err("auth-failed", "no api key")
+
+        lookup_hash = _hash_api_key(v.api_key)
+        key_row = await self.table_store.get_api_key_by_hash(lookup_hash)
+        if key_row is None:
+            return _err("auth-failed", "unknown api key")
+
+        (
+            _key_hash, _key_id, owner_id, _name, _prefix, expires,
+            _created, _last_used,
+        ) = key_row
+
+        if expires is not None:
+            # The stored expiry may be an ISO string or a datetime; a
+            # naive datetime is taken to be UTC.
+            deadline = expires
+            if isinstance(deadline, str):
+                deadline = datetime.datetime.fromisoformat(deadline)
+            if deadline.tzinfo is None:
+                deadline = deadline.replace(tzinfo=datetime.timezone.utc)
+            if _now_dt() > deadline:
+                return _err("auth-failed", "api key expired")
+
+        owner_row = await self.table_store.get_user(owner_id)
+        if owner_row is None:
+            return _err("auth-failed", "owning user missing")
+
+        owner = self._row_to_user_record(owner_row)
+        if not owner.enabled:
+            return _err("auth-failed", "owning user disabled")
+
+        # Workspace-disabled check; column 2 is the enabled flag.
+        workspace_row = await self.table_store.get_workspace(owner.workspace)
+        if not (workspace_row is not None and workspace_row[2]):
+            return _err("auth-failed", "owning workspace disabled")
+
+        return IamResponse(
+            resolved_user_id=owner.id,
+            resolved_workspace=owner.workspace,
+            resolved_roles=list(owner.roles),
+        )
+
+ # ------------------------------------------------------------------
+ # create-user
+ # ------------------------------------------------------------------
+
+    async def handle_create_user(self, v):
+        """Create a user in ``v.workspace`` from ``v.user``.
+
+        Requires a workspace, a username and a password.  The
+        workspace must exist and be enabled, and the username must be
+        unused within it (``duplicate`` error otherwise).  The name
+        defaults to the username and the email to ``""``.  The
+        response holds the user record re-read after the write."""
+        if not v.workspace:
+            return _err(
+                "invalid-argument", "workspace required for create-user",
+            )
+
+        spec = v.user
+        if spec is None:
+            return _err(
+                "invalid-argument", "user field required for create-user",
+            )
+        if not spec.username:
+            return _err("invalid-argument", "user.username required")
+        if not spec.password:
+            return _err("invalid-argument", "user.password required")
+
+        # Workspace must exist and be enabled (column 2).
+        workspace_row = await self.table_store.get_workspace(v.workspace)
+        if not (workspace_row is not None and workspace_row[2]):
+            return _err("not-found", "workspace not found or disabled")
+
+        # Uniqueness on username within workspace.
+        taken_by = await self.table_store.get_user_id_by_username(
+            v.workspace, spec.username,
+        )
+        if taken_by:
+            return _err("duplicate", "username already exists")
+
+        new_user_id = str(uuid.uuid4())
+        created_at = _now_dt()
+
+        await self.table_store.put_user(
+            id=new_user_id,
+            workspace=v.workspace,
+            username=spec.username,
+            name=spec.name or spec.username,
+            email=spec.email or "",
+            password_hash=_hash_password(spec.password),
+            roles=list(spec.roles or []),
+            enabled=spec.enabled,
+            must_change_password=spec.must_change_password,
+            created=created_at,
+        )
+
+        stored = await self.table_store.get_user(new_user_id)
+        return IamResponse(user=self._row_to_user_record(stored))
+
+ # ------------------------------------------------------------------
+ # list-users
+ # ------------------------------------------------------------------
+
+    async def handle_list_users(self, v):
+        """List users, optionally restricted to one home workspace.
+
+        System-level operation: ``v.workspace``, when supplied, is a
+        filter on the user record's home-workspace association.  An
+        empty workspace returns the deployment-wide list; the gateway
+        has already authorised the caller's authority to see that
+        scope."""
+        store = self.table_store
+        rows = await (
+            store.list_users_by_workspace(v.workspace)
+            if v.workspace
+            else store.list_users()
+        )
+        records = [self._row_to_user_record(user_row) for user_row in rows]
+        return IamResponse(users=records)
+
+ # ------------------------------------------------------------------
+ # create-api-key
+ # ------------------------------------------------------------------
+
+    async def handle_create_api_key(self, v):
+        """Issue a new API key for the user ``v.key.user_id``.
+
+        Requires ``v.key.user_id`` and ``v.key.name``.  API keys are
+        system-level records associated with the user's home
+        workspace; ``v.workspace`` is an optional integrity check
+        that, when supplied, must match that home workspace.
+
+        Only the key's hash and a short display prefix are stored.
+        The response carries the plaintext together with the stored
+        key record."""
+        request = v.key
+        if request is None or not request.user_id:
+            return _err("invalid-argument", "key.user_id required")
+        if not request.name:
+            return _err("invalid-argument", "key.name required")
+
+        # Only the existence / workspace check matters here; the row
+        # itself is not needed.
+        _owner_row, failure = await self._resolve_user(
+            request.user_id, v.workspace or None,
+        )
+        if failure is not None:
+            return failure
+
+        plaintext = _generate_api_key()
+        new_key_id = str(uuid.uuid4())
+        created_at = _now_dt()
+        expires_at = _parse_expires(request.expires)
+        plaintext_hash = _hash_api_key(plaintext)
+
+        await self.table_store.put_api_key(
+            key_hash=plaintext_hash,
+            id=new_key_id,
+            user_id=request.user_id,
+            name=request.name,
+            prefix=plaintext[:len(API_KEY_PREFIX) + 4],
+            expires=expires_at,
+            created=created_at,
+            last_used=None,
+        )
+
+        stored = await self.table_store.get_api_key_by_hash(plaintext_hash)
+        return IamResponse(
+            api_key_plaintext=plaintext,
+            api_key=self._row_to_api_key_record(stored),
+        )
+
+ # ------------------------------------------------------------------
+ # list-api-keys
+ # ------------------------------------------------------------------
+
+    async def handle_list_api_keys(self, v):
+        """List the API key records owned by ``v.user_id``.
+        ``v.workspace`` is an optional integrity check on the user's
+        home workspace."""
+        owner_id = v.user_id
+        if not owner_id:
+            return _err(
+                "invalid-argument", "user_id required for list-api-keys",
+            )
+
+        _row, failure = await self._resolve_user(
+            owner_id, v.workspace or None,
+        )
+        if failure is not None:
+            return failure
+
+        key_rows = await self.table_store.list_api_keys_by_user(owner_id)
+        records = [self._row_to_api_key_record(kr) for kr in key_rows]
+        return IamResponse(api_keys=records)
+
+ # ------------------------------------------------------------------
+ # revoke-api-key
+ # ------------------------------------------------------------------
+
+    async def handle_revoke_api_key(self, v):
+        """Delete the API key identified by ``v.key_id``.
+
+        Responds ``not-found`` when no such key exists.  When
+        ``v.workspace`` is supplied, the key's owning user must exist
+        and have that home workspace, otherwise the request is
+        refused with ``operation-not-permitted``."""
+        if not v.key_id:
+            return _err("invalid-argument", "key_id required")
+
+        key_row = await self.table_store.get_api_key_by_id(v.key_id)
+        if key_row is None:
+            return _err("not-found", "api key not found")
+
+        (
+            stored_hash, _key_id, owner_id, _name, _prefix,
+            _expires, _created, _last_used,
+        ) = key_row
+
+        # Workspace is an optional integrity check via the owning user.
+        if v.workspace:
+            owner_row = await self.table_store.get_user(owner_id)
+            same_workspace = (
+                owner_row is not None and owner_row[1] == v.workspace
+            )
+            if not same_workspace:
+                return _err(
+                    "operation-not-permitted",
+                    "key belongs to a different workspace",
+                )
+
+        await self.table_store.delete_api_key(stored_hash)
+        return IamResponse()
+
+ # ------------------------------------------------------------------
+ # authorise / authorise-many
+ #
+ # The IAM contract (see docs/tech-specs/iam-contract.md) calls
+ # for the regime — not the gateway — to decide whether an
+ # identity may perform a capability on a resource given the
+ # operation's parameters. These two operations are the OSS
+ # regime's implementation of that contract.
+ #
+ # Inputs (on IamRequest):
+ # user_id — the identity handle (the gateway's
+ # opaque reference). For OSS this is the
+ # user record's id.
+ # capability — the capability string from the
+ # capabilities.md vocabulary.
+ # resource_json — JSON dict, the resource address
+ # ({} for system, {workspace} for
+ # workspace, {workspace, flow} for flow).
+ # parameters_json — JSON dict, decision-relevant operation
+ # parameters (e.g. workspace association
+ # on user-registry operations).
+ # authorise_checks — for authorise-many, a JSON list of
+ # {capability, resource, parameters}.
+ #
+ # Outputs (on IamResponse):
+ # decision_allow — single allow / deny verdict.
+ # decision_ttl_seconds — gateway cache TTL for this
+ # decision.
+ # decisions_json — for authorise-many, list of
+ # {allow, ttl} in request order.
+ # ------------------------------------------------------------------
+
+    def _decide(self, user_row, capability, resource, parameters):
+        """Single authorisation decision.  Returns ``(allow, ttl)``.
+
+        ``user_row`` is the identity's user row, or ``None`` when the
+        user does not exist.  ``capability`` is the capability being
+        requested.  ``resource`` and ``parameters`` are the decoded
+        JSON objects from the request; either may be empty or
+        ``None``.
+
+        The verdict is *allow* when the user exists, is enabled, and
+        holds at least one known role that grants ``capability`` and
+        whose workspace scope permits the target workspace.  The
+        target workspace is ``resource["workspace"]``, falling back
+        to ``parameters["workspace"]``; when neither names one, no
+        scope check is applied.  Role names missing from
+        ``ROLE_DEFINITIONS`` are ignored.
+
+        A ``resource`` or ``parameters`` value that is not a JSON
+        object is a malformed request and is denied rather than
+        raising, matching how ``authorise-many`` treats a malformed
+        check entry.
+
+        The TTL is always ``AUTHZ_CACHE_TTL_SECONDS``."""
+        ttl = AUTHZ_CACHE_TTL_SECONDS
+
+        if user_row is None:
+            return False, ttl
+
+        # user_row layout:
+        #   0:id 1:workspace 2:username 3:name 4:email 5:password_hash
+        #   6:roles 7:enabled 8:must_change_password 9:created
+        if not user_row[7]:  # disabled
+            return False, ttl
+
+        # NOTE: no workspace-enabled check happens here.  This method
+        # is synchronous and performs no storage reads; credentials
+        # bound to a disabled workspace are rejected when they are
+        # resolved (login / resolve-api-key).
+        resource = resource or {}
+        if not isinstance(resource, dict):
+            return False, ttl
+        target_workspace = resource.get("workspace")
+
+        # ``parameters`` is only consulted when the resource names no
+        # workspace, so it is only validated in that case.
+        if not target_workspace:
+            parameters = parameters or {}
+            if not isinstance(parameters, dict):
+                return False, ttl
+            target_workspace = parameters.get("workspace")
+
+        roles = user_row[6] or set()
+        assigned_workspace = user_row[1]
+
+        for role_name in roles:
+            defn = ROLE_DEFINITIONS.get(role_name)
+            if defn is None:
+                continue
+            if capability not in defn["capabilities"]:
+                continue
+            if target_workspace is None or _scope_permits(
+                defn["workspace_scope"],
+                target_workspace,
+                assigned_workspace,
+            ):
+                return True, ttl
+
+        return False, ttl
+
+    async def handle_authorise(self, v):
+        """Answer one authorisation question for ``v.user_id``.
+
+        Requires ``v.capability`` and ``v.user_id``.
+        ``v.resource_json`` and ``v.parameters_json`` are decoded as
+        JSON, with an empty value treated as ``{}``; undecodable JSON
+        is an ``invalid-argument`` error.  The response carries the
+        allow / deny verdict and its cache TTL."""
+        if not v.capability:
+            return _err("invalid-argument", "capability required")
+        if not v.user_id:
+            return _err("invalid-argument", "user_id (handle) required")
+
+        try:
+            resource = json.loads(v.resource_json or "{}")
+            parameters = json.loads(v.parameters_json or "{}")
+        except json.JSONDecodeError as e:
+            return _err("invalid-argument", f"bad json: {e}")
+
+        subject_row = await self.table_store.get_user(v.user_id)
+        verdict = self._decide(
+            subject_row, v.capability, resource, parameters,
+        )
+        allowed, cache_ttl = verdict
+        return IamResponse(
+            decision_allow=allowed,
+            decision_ttl_seconds=cache_ttl,
+        )
+
+    async def handle_authorise_many(self, v):
+        """Answer a batch of authorisation questions for ``v.user_id``.
+
+        ``v.authorise_checks`` must be a JSON list; each entry is a
+        ``{capability, resource, parameters}`` object.  An entry that
+        is not an object is denied.  The user row is read once for
+        the whole batch.  The response's ``decisions_json`` is a JSON
+        list of ``{allow, ttl}`` in request order."""
+        if not v.user_id:
+            return _err("invalid-argument", "user_id (handle) required")
+        if not v.authorise_checks:
+            return _err("invalid-argument", "authorise_checks required")
+
+        try:
+            checks = json.loads(v.authorise_checks)
+        except json.JSONDecodeError as e:
+            return _err("invalid-argument", f"bad json: {e}")
+
+        if not isinstance(checks, list):
+            return _err(
+                "invalid-argument",
+                "authorise_checks must be a JSON list",
+            )
+
+        # One user lookup for the whole batch.
+        subject_row = await self.table_store.get_user(v.user_id)
+
+        def judge(check):
+            # Malformed entry: deny it without failing the batch.
+            if not isinstance(check, dict):
+                return {
+                    "allow": False,
+                    "ttl": AUTHZ_CACHE_TTL_SECONDS,
+                }
+            allowed, cache_ttl = self._decide(
+                subject_row,
+                check.get("capability", ""),
+                check.get("resource") or {},
+                check.get("parameters") or {},
+            )
+            return {"allow": allowed, "ttl": cache_ttl}
+
+        decisions = [judge(check) for check in checks]
+
+        return IamResponse(decisions_json=json.dumps(decisions))
diff --git a/trustgraph-flow/trustgraph/iam/service/service.py b/trustgraph-flow/trustgraph/iam/service/service.py
new file mode 100644
index 00000000..147bd56a
--- /dev/null
+++ b/trustgraph-flow/trustgraph/iam/service/service.py
@@ -0,0 +1,233 @@
+"""
+IAM service processor. Terminates the IAM request queue and forwards
+each request to the IamService business logic, then returns the
+response on the IAM response queue.
+
+Shape mirrors trustgraph.config.service.
+"""
+
+import logging
+import os
+
+from trustgraph.schema import Error
+from trustgraph.schema import IamRequest, IamResponse
+from trustgraph.schema import iam_request_queue, iam_response_queue
+
+from trustgraph.base import AsyncProcessor, Consumer, Producer
+from trustgraph.base import ConsumerMetrics, ProducerMetrics
+from trustgraph.base.cassandra_config import (
+ add_cassandra_args, resolve_cassandra_config,
+)
+
+from . iam import IamService
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Identity string handed to Processor.launch() by run().
+default_ident = "iam-svc"
+
+# Queue defaults; overridable with --iam-request-queue and
+# --iam-response-queue (see Processor.add_args).
+default_iam_request_queue = iam_request_queue
+default_iam_response_queue = iam_response_queue
+
+# Environment variables consulted as a fallback when the
+# corresponding params field is not set in the processor-group YAML
+# or via CLI. Intended for K8s Secret / env-var injection so the
+# bootstrap token never has to live in the YAML (and thus in git).
+ENV_BOOTSTRAP_MODE = "IAM_BOOTSTRAP_MODE"
+ENV_BOOTSTRAP_TOKEN = "IAM_BOOTSTRAP_TOKEN"
+
+
+class Processor(AsyncProcessor):
+
+ def __init__(self, **params):
+
+ iam_req_q = params.get(
+ "iam_request_queue", default_iam_request_queue,
+ )
+ iam_resp_q = params.get(
+ "iam_response_queue", default_iam_response_queue,
+ )
+
+ # Resolve bootstrap mode + token. Precedence: explicit
+ # params (CLI / processor-group YAML) → environment variable
+ # → unset (fail-closed). The env-var path is the K8s-native
+ # injection point: an `IAM_BOOTSTRAP_TOKEN` from a Secret
+ # never has to land in the YAML, and therefore never enters
+ # git history.
+ bootstrap_mode = (
+ params.get("bootstrap_mode")
+ or os.environ.get(ENV_BOOTSTRAP_MODE)
+ )
+ bootstrap_token = (
+ params.get("bootstrap_token")
+ or os.environ.get(ENV_BOOTSTRAP_TOKEN)
+ )
+
+ if bootstrap_mode not in ("token", "bootstrap"):
+ raise RuntimeError(
+ "iam-svc: bootstrap-mode is required. Set to 'token' "
+ "(with bootstrap-token) for production, or 'bootstrap' "
+ "to enable the explicit bootstrap operation over the "
+ "pub/sub bus (dev / quick-start only, not safe under "
+ "public exposure). Configurable via processor-group "
+ f"params or the {ENV_BOOTSTRAP_MODE} environment "
+ "variable. Refusing to start."
+ )
+ if bootstrap_mode == "token" and not bootstrap_token:
+ raise RuntimeError(
+ "iam-svc: bootstrap-mode=token requires bootstrap-token "
+ f"(or the {ENV_BOOTSTRAP_TOKEN} environment "
+ "variable). Refusing to start."
+ )
+ if bootstrap_mode == "bootstrap" and bootstrap_token:
+ raise RuntimeError(
+ "iam-svc: bootstrap-token is not accepted when "
+ "bootstrap-mode=bootstrap. Ambiguous intent. "
+ "Refusing to start."
+ )
+
+ self.bootstrap_mode = bootstrap_mode
+ self.bootstrap_token = bootstrap_token
+
+ cassandra_host = params.get("cassandra_host")
+ cassandra_username = params.get("cassandra_username")
+ cassandra_password = params.get("cassandra_password")
+
+ hosts, username, password, keyspace = resolve_cassandra_config(
+ host=cassandra_host,
+ username=cassandra_username,
+ password=cassandra_password,
+ default_keyspace="iam",
+ )
+
+ self.cassandra_host = hosts
+ self.cassandra_username = username
+ self.cassandra_password = password
+
+ super().__init__(
+ **params | {
+ "iam_request_schema": IamRequest.__name__,
+ "iam_response_schema": IamResponse.__name__,
+ "cassandra_host": self.cassandra_host,
+ "cassandra_username": self.cassandra_username,
+ "cassandra_password": self.cassandra_password,
+ }
+ )
+
+ iam_request_metrics = ConsumerMetrics(
+ processor=self.id, flow=None, name="iam-request",
+ )
+ iam_response_metrics = ProducerMetrics(
+ processor=self.id, flow=None, name="iam-response",
+ )
+
+ self.iam_request_topic = iam_req_q
+
+ self.iam_request_consumer = Consumer(
+ taskgroup=self.taskgroup,
+ backend=self.pubsub,
+ flow=None,
+ topic=iam_req_q,
+ subscriber=self.id,
+ schema=IamRequest,
+ handler=self.on_iam_request,
+ metrics=iam_request_metrics,
+ )
+
+ self.iam_response_producer = Producer(
+ backend=self.pubsub,
+ topic=iam_resp_q,
+ schema=IamResponse,
+ metrics=iam_response_metrics,
+ )
+
+ self.iam = IamService(
+ host=self.cassandra_host,
+ username=self.cassandra_username,
+ password=self.cassandra_password,
+ keyspace=keyspace,
+ bootstrap_mode=self.bootstrap_mode,
+ bootstrap_token=self.bootstrap_token,
+ )
+
+ logger.info(
+ f"IAM service initialised (bootstrap-mode={self.bootstrap_mode})"
+ )
+
+ async def start(self):
+ await self.pubsub.ensure_topic(self.iam_request_topic)
+ # Token-mode auto-bootstrap runs before we accept requests so
+ # the first inbound call always sees a populated table.
+ await self.iam.auto_bootstrap_if_token_mode()
+ await self.iam_request_consumer.start()
+
+ async def on_iam_request(self, msg, consumer, flow):
+
+ id = None
+ try:
+ v = msg.value()
+ id = msg.properties()["id"]
+ logger.debug(
+ f"Handling IAM request {id} op={v.operation!r}"
+ )
+ resp = await self.iam.handle(v)
+ await self.iam_response_producer.send(
+ resp, properties={"id": id},
+ )
+ except Exception as e:
+ logger.error(
+ f"IAM request failed: {type(e).__name__}: {e}",
+ exc_info=True,
+ )
+ resp = IamResponse(
+ error=Error(type="internal-error", message=str(e)),
+ )
+ if id is not None:
+ await self.iam_response_producer.send(
+ resp, properties={"id": id},
+ )
+
+ @staticmethod
+ def add_args(parser):
+ AsyncProcessor.add_args(parser)
+
+ parser.add_argument(
+ "--iam-request-queue",
+ default=default_iam_request_queue,
+ help=f"IAM request queue (default: {default_iam_request_queue})",
+ )
+ parser.add_argument(
+ "--iam-response-queue",
+ default=default_iam_response_queue,
+ help=f"IAM response queue (default: {default_iam_response_queue})",
+ )
+ parser.add_argument(
+ "--bootstrap-mode",
+ default=None,
+ choices=["token", "bootstrap"],
+ help=(
+ "IAM bootstrap mode (required). "
+ "'token' = operator supplies the initial admin API "
+ "key via --bootstrap-token; auto-seeds on first start, "
+ "bootstrap operation refused. "
+ "'bootstrap' = bootstrap operation is live over the "
+ "bus until tables are populated; a token is generated "
+ "and returned by tg-bootstrap-iam. Unsafe to run "
+ "'bootstrap' mode with public exposure."
+ ),
+ )
+ parser.add_argument(
+ "--bootstrap-token",
+ default=None,
+ help=(
+ "Initial admin API key plaintext, required when "
+ "--bootstrap-mode=token. Treat as a one-time "
+ "credential: the operator should rotate to a new key "
+ "and revoke this one after first use."
+ ),
+ )
+
+ add_cassandra_args(parser)
+
+
+def run():
+    """Launch the IAM service processor under its default identity."""
+    # `__doc__` resolves to this module's docstring (the global name),
+    # not to the docstring of this function.
+    Processor.launch(default_ident, __doc__)
diff --git a/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py b/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py
index f6c5dcb8..2e537fde 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py
@@ -4,7 +4,7 @@ Simple LLM service, performs text prompt completion using an Ollama service.
Input is prompt, output is response.
"""
-from ollama import Client
+from ollama import AsyncClient
import os
import logging
@@ -38,23 +38,23 @@ class Processor(LlmService):
self.default_model = model
self.temperature = temperature
- self.llm = Client(host=ollama)
+ self.llm = AsyncClient(host=ollama)
self._checked_models = set()
- def _ensure_model(self, model_name):
+ async def _ensure_model(self, model_name):
"""Check if model exists locally, pull it if not."""
if model_name in self._checked_models:
return
try:
- self.llm.show(model_name)
+ await self.llm.show(model_name)
self._checked_models.add(model_name)
except Exception as e:
status_code = getattr(e, 'status_code', None)
if status_code == 404 or "not found" in str(e).lower():
logger.info(f"Ollama model '{model_name}' not found locally. Pulling, this may take a while...")
try:
- self.llm.pull(model_name)
+ await self.llm.pull(model_name)
self._checked_models.add(model_name)
logger.info(f"Successfully pulled Ollama model '{model_name}'.")
except Exception as pull_e:
@@ -66,9 +66,9 @@ class Processor(LlmService):
# Use provided model or fall back to default
model_name = model or self.default_model
-
+
# Ensure the model exists/is pulled
- self._ensure_model(model_name)
+ await self._ensure_model(model_name)
# Use provided temperature or fall back to default
effective_temperature = temperature if temperature is not None else self.temperature
@@ -79,7 +79,7 @@ class Processor(LlmService):
try:
- response = self.llm.generate(model_name, prompt, options={'temperature': effective_temperature})
+ response = await self.llm.generate(model_name, prompt, options={'temperature': effective_temperature})
response_text = response['response']
logger.debug("Sending response...")
@@ -113,7 +113,7 @@ class Processor(LlmService):
model_name = model or self.default_model
# Ensure the model exists/is pulled
- self._ensure_model(model_name)
+ await self._ensure_model(model_name)
effective_temperature = temperature if temperature is not None else self.temperature
@@ -123,7 +123,7 @@ class Processor(LlmService):
prompt = system + "\n\n" + prompt
try:
- stream = self.llm.generate(
+ stream = await self.llm.generate(
model_name,
prompt,
options={'temperature': effective_temperature},
@@ -133,7 +133,7 @@ class Processor(LlmService):
total_input_tokens = 0
total_output_tokens = 0
- for chunk in stream:
+ async for chunk in stream:
if 'response' in chunk and chunk['response']:
yield LlmChunk(
text=chunk['response'],
diff --git a/trustgraph-flow/trustgraph/tables/iam.py b/trustgraph-flow/trustgraph/tables/iam.py
new file mode 100644
index 00000000..f1a0734f
--- /dev/null
+++ b/trustgraph-flow/trustgraph/tables/iam.py
@@ -0,0 +1,436 @@
+"""
+IAM Cassandra table store.
+
+Tables:
+ - iam_workspaces (id primary key)
+ - iam_users (id primary key) + iam_users_by_username lookup table
+ (workspace, username) -> id
+ - iam_api_keys (key_hash primary key) with secondary indexes on user_id and id
+ - iam_signing_keys (kid primary key) — RSA keypairs for JWT signing
+
+See docs/tech-specs/iam-protocol.md for the wire-level context.
+"""
+
+import logging
+
+from cassandra.cluster import Cluster
+from cassandra.auth import PlainTextAuthProvider
+from ssl import SSLContext, PROTOCOL_TLSv1_2
+
+from . cassandra_async import async_execute
+
+logger = logging.getLogger(__name__)
+
+
+class IamTableStore:
+
+ def __init__(
+ self,
+ cassandra_host, cassandra_username, cassandra_password,
+ keyspace,
+ ):
+ self.keyspace = keyspace
+
+ logger.info("IAM: connecting to Cassandra...")
+
+ if isinstance(cassandra_host, str):
+ cassandra_host = [h.strip() for h in cassandra_host.split(",")]
+
+ if cassandra_username and cassandra_password:
+ ssl_context = SSLContext(PROTOCOL_TLSv1_2)
+ auth_provider = PlainTextAuthProvider(
+ username=cassandra_username, password=cassandra_password,
+ )
+ self.cluster = Cluster(
+ cassandra_host,
+ auth_provider=auth_provider,
+ ssl_context=ssl_context,
+ )
+ else:
+ self.cluster = Cluster(cassandra_host)
+
+ self.cassandra = self.cluster.connect()
+
+ logger.info("IAM: connected.")
+
+ self._ensure_schema()
+ self._prepare_statements()
+
+ def _ensure_schema(self):
+ # FIXME: Replication factor should be configurable.
+ self.cassandra.execute(f"""
+ create keyspace if not exists {self.keyspace}
+ with replication = {{
+ 'class' : 'SimpleStrategy',
+ 'replication_factor' : 1
+ }};
+ """)
+ self.cassandra.set_keyspace(self.keyspace)
+
+ self.cassandra.execute("""
+ CREATE TABLE IF NOT EXISTS iam_workspaces (
+ id text PRIMARY KEY,
+ name text,
+ enabled boolean,
+ created timestamp
+ );
+ """)
+
+ self.cassandra.execute("""
+ CREATE TABLE IF NOT EXISTS iam_users (
+ id text PRIMARY KEY,
+ workspace text,
+ username text,
+ name text,
+ email text,
+ password_hash text,
+ roles set,
+ enabled boolean,
+ must_change_password boolean,
+ created timestamp
+ );
+ """)
+
+ self.cassandra.execute("""
+ CREATE TABLE IF NOT EXISTS iam_users_by_username (
+ workspace text,
+ username text,
+ user_id text,
+ PRIMARY KEY ((workspace), username)
+ );
+ """)
+
+ self.cassandra.execute("""
+ CREATE TABLE IF NOT EXISTS iam_api_keys (
+ key_hash text PRIMARY KEY,
+ id text,
+ user_id text,
+ name text,
+ prefix text,
+ expires timestamp,
+ created timestamp,
+ last_used timestamp
+ );
+ """)
+
+ self.cassandra.execute("""
+ CREATE INDEX IF NOT EXISTS iam_api_keys_user_id_idx
+ ON iam_api_keys (user_id);
+ """)
+
+ self.cassandra.execute("""
+ CREATE INDEX IF NOT EXISTS iam_api_keys_id_idx
+ ON iam_api_keys (id);
+ """)
+
+ self.cassandra.execute("""
+ CREATE TABLE IF NOT EXISTS iam_signing_keys (
+ kid text PRIMARY KEY,
+ private_pem text,
+ public_pem text,
+ created timestamp,
+ retired timestamp
+ );
+ """)
+
+ logger.info("IAM: Cassandra schema OK.")
+
+ def _prepare_statements(self):
+ c = self.cassandra
+
+ self.put_workspace_stmt = c.prepare("""
+ INSERT INTO iam_workspaces (id, name, enabled, created)
+ VALUES (?, ?, ?, ?)
+ """)
+ self.get_workspace_stmt = c.prepare("""
+ SELECT id, name, enabled, created FROM iam_workspaces
+ WHERE id = ?
+ """)
+ self.list_workspaces_stmt = c.prepare("""
+ SELECT id, name, enabled, created FROM iam_workspaces
+ """)
+
+ self.put_user_stmt = c.prepare("""
+ INSERT INTO iam_users (
+ id, workspace, username, name, email, password_hash,
+ roles, enabled, must_change_password, created
+ )
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ """)
+ self.get_user_stmt = c.prepare("""
+ SELECT id, workspace, username, name, email, password_hash,
+ roles, enabled, must_change_password, created
+ FROM iam_users WHERE id = ?
+ """)
+ self.list_users_by_workspace_stmt = c.prepare("""
+ SELECT id, workspace, username, name, email, password_hash,
+ roles, enabled, must_change_password, created
+ FROM iam_users WHERE workspace = ? ALLOW FILTERING
+ """)
+ self.list_users_stmt = c.prepare("""
+ SELECT id, workspace, username, name, email, password_hash,
+ roles, enabled, must_change_password, created
+ FROM iam_users
+ """)
+
+ self.put_username_lookup_stmt = c.prepare("""
+ INSERT INTO iam_users_by_username (workspace, username, user_id)
+ VALUES (?, ?, ?)
+ """)
+ self.get_user_id_by_username_stmt = c.prepare("""
+ SELECT user_id FROM iam_users_by_username
+ WHERE workspace = ? AND username = ?
+ """)
+ self.delete_username_lookup_stmt = c.prepare("""
+ DELETE FROM iam_users_by_username
+ WHERE workspace = ? AND username = ?
+ """)
+ self.delete_user_stmt = c.prepare("""
+ DELETE FROM iam_users WHERE id = ?
+ """)
+
+ self.put_api_key_stmt = c.prepare("""
+ INSERT INTO iam_api_keys (
+ key_hash, id, user_id, name, prefix, expires,
+ created, last_used
+ )
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+ """)
+ self.get_api_key_by_hash_stmt = c.prepare("""
+ SELECT key_hash, id, user_id, name, prefix, expires,
+ created, last_used
+ FROM iam_api_keys WHERE key_hash = ?
+ """)
+ self.get_api_key_by_id_stmt = c.prepare("""
+ SELECT key_hash, id, user_id, name, prefix, expires,
+ created, last_used
+ FROM iam_api_keys WHERE id = ?
+ """)
+ self.list_api_keys_by_user_stmt = c.prepare("""
+ SELECT key_hash, id, user_id, name, prefix, expires,
+ created, last_used
+ FROM iam_api_keys WHERE user_id = ?
+ """)
+ self.delete_api_key_stmt = c.prepare("""
+ DELETE FROM iam_api_keys WHERE key_hash = ?
+ """)
+
+ self.put_signing_key_stmt = c.prepare("""
+ INSERT INTO iam_signing_keys (
+ kid, private_pem, public_pem, created, retired
+ )
+ VALUES (?, ?, ?, ?, ?)
+ """)
+ self.list_signing_keys_stmt = c.prepare("""
+ SELECT kid, private_pem, public_pem, created, retired
+ FROM iam_signing_keys
+ """)
+ self.retire_signing_key_stmt = c.prepare("""
+ UPDATE iam_signing_keys SET retired = ? WHERE kid = ?
+ """)
+
+ self.update_user_profile_stmt = c.prepare("""
+ UPDATE iam_users
+ SET name = ?, email = ?, roles = ?, enabled = ?,
+ must_change_password = ?
+ WHERE id = ?
+ """)
+ self.update_user_password_stmt = c.prepare("""
+ UPDATE iam_users
+ SET password_hash = ?, must_change_password = ?
+ WHERE id = ?
+ """)
+ self.update_user_enabled_stmt = c.prepare("""
+ UPDATE iam_users SET enabled = ? WHERE id = ?
+ """)
+
+ self.update_workspace_stmt = c.prepare("""
+ UPDATE iam_workspaces SET name = ?, enabled = ?
+ WHERE id = ?
+ """)
+
+ # ------------------------------------------------------------------
+ # Workspaces
+ # ------------------------------------------------------------------
+
+ async def put_workspace(self, id, name, enabled, created):
+ await async_execute(
+ self.cassandra, self.put_workspace_stmt,
+ (id, name, enabled, created),
+ )
+
+ async def get_workspace(self, id):
+ rows = await async_execute(
+ self.cassandra, self.get_workspace_stmt, (id,),
+ )
+ return rows[0] if rows else None
+
+ async def list_workspaces(self):
+ return await async_execute(
+ self.cassandra, self.list_workspaces_stmt,
+ )
+
+ # ------------------------------------------------------------------
+ # Users
+ # ------------------------------------------------------------------
+
+ async def put_user(
+ self, id, workspace, username, name, email, password_hash,
+ roles, enabled, must_change_password, created,
+ ):
+ await async_execute(
+ self.cassandra, self.put_user_stmt,
+ (
+ id, workspace, username, name, email, password_hash,
+ set(roles) if roles else set(),
+ enabled, must_change_password, created,
+ ),
+ )
+ await async_execute(
+ self.cassandra, self.put_username_lookup_stmt,
+ (workspace, username, id),
+ )
+
+ async def get_user(self, id):
+ rows = await async_execute(
+ self.cassandra, self.get_user_stmt, (id,),
+ )
+ return rows[0] if rows else None
+
+ async def get_user_id_by_username(self, workspace, username):
+ rows = await async_execute(
+ self.cassandra, self.get_user_id_by_username_stmt,
+ (workspace, username),
+ )
+ return rows[0][0] if rows else None
+
+ async def list_users_by_workspace(self, workspace):
+ return await async_execute(
+ self.cassandra, self.list_users_by_workspace_stmt, (workspace,),
+ )
+
+ async def list_users(self):
+ """List every user across the deployment. Used by the
+ system-level list-users handler when no workspace filter is
+ supplied; the gateway has already authorised the call against
+ the caller's authority."""
+ return await async_execute(
+ self.cassandra, self.list_users_stmt, (),
+ )
+
+ async def delete_user(self, id):
+ await async_execute(
+ self.cassandra, self.delete_user_stmt, (id,),
+ )
+
+ async def delete_username_lookup(self, workspace, username):
+ await async_execute(
+ self.cassandra, self.delete_username_lookup_stmt,
+ (workspace, username),
+ )
+
+ # ------------------------------------------------------------------
+ # API keys
+ # ------------------------------------------------------------------
+
+ async def put_api_key(
+ self, key_hash, id, user_id, name, prefix, expires,
+ created, last_used,
+ ):
+ await async_execute(
+ self.cassandra, self.put_api_key_stmt,
+ (key_hash, id, user_id, name, prefix, expires,
+ created, last_used),
+ )
+
+ async def get_api_key_by_hash(self, key_hash):
+ rows = await async_execute(
+ self.cassandra, self.get_api_key_by_hash_stmt, (key_hash,),
+ )
+ return rows[0] if rows else None
+
+ async def get_api_key_by_id(self, id):
+ rows = await async_execute(
+ self.cassandra, self.get_api_key_by_id_stmt, (id,),
+ )
+ return rows[0] if rows else None
+
+ async def list_api_keys_by_user(self, user_id):
+ return await async_execute(
+ self.cassandra, self.list_api_keys_by_user_stmt, (user_id,),
+ )
+
+ async def delete_api_key(self, key_hash):
+ await async_execute(
+ self.cassandra, self.delete_api_key_stmt, (key_hash,),
+ )
+
+ # ------------------------------------------------------------------
+ # Signing keys
+ # ------------------------------------------------------------------
+
+ async def put_signing_key(self, kid, private_pem, public_pem,
+ created, retired):
+ await async_execute(
+ self.cassandra, self.put_signing_key_stmt,
+ (kid, private_pem, public_pem, created, retired),
+ )
+
+ async def list_signing_keys(self):
+ return await async_execute(
+ self.cassandra, self.list_signing_keys_stmt,
+ )
+
+ async def retire_signing_key(self, kid, retired):
+ await async_execute(
+ self.cassandra, self.retire_signing_key_stmt,
+ (retired, kid),
+ )
+
+ # ------------------------------------------------------------------
+ # User partial updates
+ # ------------------------------------------------------------------
+
+ async def update_user_profile(
+ self, id, name, email, roles, enabled, must_change_password,
+ ):
+ await async_execute(
+ self.cassandra, self.update_user_profile_stmt,
+ (
+ name, email,
+ set(roles) if roles else set(),
+ enabled, must_change_password, id,
+ ),
+ )
+
+ async def update_user_password(
+ self, id, password_hash, must_change_password,
+ ):
+ await async_execute(
+ self.cassandra, self.update_user_password_stmt,
+ (password_hash, must_change_password, id),
+ )
+
+ async def update_user_enabled(self, id, enabled):
+ await async_execute(
+ self.cassandra, self.update_user_enabled_stmt,
+ (enabled, id),
+ )
+
+ # ------------------------------------------------------------------
+ # Workspace updates
+ # ------------------------------------------------------------------
+
+ async def update_workspace(self, id, name, enabled):
+ await async_execute(
+ self.cassandra, self.update_workspace_stmt,
+ (name, enabled, id),
+ )
+
+ # ------------------------------------------------------------------
+ # Bootstrap helpers
+ # ------------------------------------------------------------------
+
+ async def any_workspace_exists(self):
+ rows = await self.list_workspaces()
+ return bool(rows)
diff --git a/trustgraph-vertexai/trustgraph/model/text_completion/googleaistudio/llm.py b/trustgraph-vertexai/trustgraph/model/text_completion/googleaistudio/llm.py
index 142fc45c..b01ff410 100644
--- a/trustgraph-vertexai/trustgraph/model/text_completion/googleaistudio/llm.py
+++ b/trustgraph-vertexai/trustgraph/model/text_completion/googleaistudio/llm.py
@@ -43,18 +43,6 @@ class Processor(LlmService):
temperature = params.get("temperature", default_temperature)
max_output = params.get("max_output", default_max_output)
- from google import genai
- from google.genai import types
- from google.genai.types import HarmCategory, HarmBlockThreshold
- from google.genai.errors import ClientError
- from google.api_core.exceptions import ResourceExhausted
- self.genai = genai
- self.types = types
- self.HarmCategory = HarmCategory
- self.HarmBlockThreshold = HarmBlockThreshold
- self.ClientError = ClientError
- self.ResourceExhausted = ResourceExhausted
-
if api_key is None:
raise RuntimeError("Google AI Studio API key not specified")
@@ -66,7 +54,7 @@ class Processor(LlmService):
}
)
- self.client = self.genai.Client(api_key=api_key, vertexai=False)
+ self.client = genai.Client(api_key=api_key, vertexai=False)
self.default_model = model
self.temperature = temperature
self.max_output = max_output
@@ -74,7 +62,7 @@ class Processor(LlmService):
# Cache for generation configs per model
self.generation_configs = {}
- block_level = self.HarmBlockThreshold.BLOCK_ONLY_HIGH
+ block_level = HarmBlockThreshold.BLOCK_ONLY_HIGH
self.safety_settings = [
types.SafetySetting(
@@ -159,7 +147,7 @@ class Processor(LlmService):
return resp
- except self.ResourceExhausted as e:
+ except ResourceExhausted as e:
logger.warning("Rate limit exceeded")