diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 07af8db9..02c546df 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -75,6 +75,13 @@ jobs: - name: Checkout uses: actions/checkout@v4 + - name: "Free up some disk space" + run: | + sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + podman image prune --all --force + podman builder prune -a -f + - name: Docker Hub token run: echo ${{ secrets.DOCKER_SECRET }} > docker-token.txt diff --git a/dev-tools/tests/smoke/smoke_ws_queries.py b/dev-tools/tests/smoke/smoke_ws_queries.py new file mode 100755 index 00000000..c6a4dfb6 --- /dev/null +++ b/dev-tools/tests/smoke/smoke_ws_queries.py @@ -0,0 +1,475 @@ +#!/usr/bin/env python3 +""" +WebSocket smoke / load test that hammers a TrustGraph gateway with a +mix of `embeddings`, `graph-embeddings`, and `triples` queries while +keeping a target number of in-flight requests at all times. + +Useful for reproducing the "worker hangs after a while, all subsequent +requests time out" failure mode — leaves enough load on the system to +saturate worker concurrency and reports per-service success/timeout +rates and latency distributions over time. + +Usage: + smoke_ws_queries.py --flow onto-rag --duration 120 --concurrency 20 + +Connects via /api/v1/socket using the first-frame auth protocol. +""" + +import argparse +import asyncio +import json +import os +import random +import statistics +import sys +import time +import uuid +from collections import defaultdict +from typing import Any + +import websockets + + +DEFAULT_TEXT = ( + "What caused the space shuttle to explode and what were the " + "main factors leading to the disaster?" 
+) + + +class Stats: + """Per-service rolling counters and latency samples.""" + + def __init__(self) -> None: + self.sent = 0 + self.ok = 0 + self.err = 0 + self.timeout = 0 + self.latencies_ms: list[float] = [] + + def record_ok(self, latency_ms: float) -> None: + self.ok += 1 + self.latencies_ms.append(latency_ms) + + def record_err(self) -> None: + self.err += 1 + + def record_timeout(self) -> None: + self.timeout += 1 + + def percentile(self, p: float) -> float: + if not self.latencies_ms: + return 0.0 + s = sorted(self.latencies_ms) + idx = min(len(s) - 1, int(len(s) * p)) + return s[idx] + + def summary(self) -> str: + if self.latencies_ms: + mn = min(self.latencies_ms) + mx = max(self.latencies_ms) + mean = statistics.mean(self.latencies_ms) + p50 = self.percentile(0.50) + p95 = self.percentile(0.95) + p99 = self.percentile(0.99) + lat = ( + f"min={mn:.0f} mean={mean:.0f} p50={p50:.0f} " + f"p95={p95:.0f} p99={p99:.0f} max={mx:.0f} ms" + ) + else: + lat = "no successful samples" + return ( + f"sent={self.sent} ok={self.ok} err={self.err} " + f"timeout={self.timeout} | {lat}" + ) + + +class WSClient: + """Thin async websocket client with first-frame auth and a shared + reader task that demuxes responses to per-request asyncio queues.""" + + def __init__( + self, url: str, token: str | None, workspace: str, + ping_timeout: int, + ) -> None: + self.url = url + self.token = token + self.workspace = workspace + self.ping_timeout = ping_timeout + self._ws: Any = None + self._pending: dict[str, asyncio.Queue] = {} + self._reader_task: asyncio.Task | None = None + self._closed = asyncio.Event() + + async def connect(self) -> None: + ws_url = self.url.rstrip("/") + "/api/v1/socket" + if ws_url.startswith("http://"): + ws_url = "ws://" + ws_url[len("http://"):] + elif ws_url.startswith("https://"): + ws_url = "wss://" + ws_url[len("https://"):] + elif not ( + ws_url.startswith("ws://") or ws_url.startswith("wss://") + ): + ws_url = "ws://" + ws_url + + self._ws = 
await websockets.connect( + ws_url, + ping_interval=20, + ping_timeout=self.ping_timeout, + max_size=64 * 1024 * 1024, + ) + + if self.token: + # First-frame auth handshake. + await self._ws.send(json.dumps({ + "type": "auth", "token": self.token, + })) + raw = await asyncio.wait_for(self._ws.recv(), timeout=10) + resp = json.loads(raw) + if resp.get("type") != "auth-ok": + await self._ws.close() + raise RuntimeError(f"auth failed: {resp}") + if "workspace" in resp: + # Server-resolved workspace overrides the user-supplied + # one, mirroring AsyncSocketClient behaviour. + self.workspace = resp["workspace"] + else: + print( + "WARNING: no token provided — skipping auth handshake. " + "Requests will be rejected unless the gateway is " + "running without IAM enforcement.", + file=sys.stderr, + ) + + self._reader_task = asyncio.create_task(self._reader()) + + async def _reader(self) -> None: + try: + async for raw in self._ws: + msg = json.loads(raw) + rid = msg.get("id") + if rid and rid in self._pending: + await self._pending[rid].put(msg) + except websockets.exceptions.ConnectionClosed: + pass + except Exception as e: + for q in list(self._pending.values()): + try: + q.put_nowait({"error": {"message": str(e)}}) + except Exception: + pass + finally: + self._closed.set() + + async def request( + self, service: str, flow: str | None, body: dict, timeout: float, + ) -> tuple[dict | None, str | None, float]: + """Send one request, await final response. + + Returns ``(response, error, latency_ms)``. ``response`` is None + on error/timeout. ``error`` describes the failure category. 
+ """ + rid = str(uuid.uuid4()) + q: asyncio.Queue = asyncio.Queue() + self._pending[rid] = q + env = { + "id": rid, + "workspace": self.workspace, + "service": service, + "request": body, + } + if flow: + env["flow"] = flow + + t0 = time.monotonic() + try: + await self._ws.send(json.dumps(env)) + while True: + try: + msg = await asyncio.wait_for(q.get(), timeout=timeout) + except asyncio.TimeoutError: + return None, "timeout", (time.monotonic() - t0) * 1000 + if "error" in msg and msg["error"]: + err = msg["error"] + err_msg = ( + err.get("message") if isinstance(err, dict) else str(err) + ) + return None, f"error: {err_msg}", (time.monotonic() - t0) * 1000 + if msg.get("complete"): + return msg.get("response"), None, (time.monotonic() - t0) * 1000 + # Otherwise an intermediate streaming chunk — keep waiting. + finally: + self._pending.pop(rid, None) + + async def close(self) -> None: + if self._ws is not None: + await self._ws.close() + if self._reader_task is not None: + try: + await asyncio.wait_for(self._reader_task, timeout=2) + except (asyncio.TimeoutError, asyncio.CancelledError): + pass + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument( + "--url", + default=os.getenv("TRUSTGRAPH_URL", "http://localhost:8088/"), + help="Gateway URL (http or ws). Default: %(default)s", + ) + p.add_argument( + "--token", + default=os.getenv("TRUSTGRAPH_TOKEN"), + help="Auth token (or set TRUSTGRAPH_TOKEN). Optional — if " + "omitted, the auth handshake is skipped (only works " + "when the gateway is running without IAM enforcement).", + ) + p.add_argument( + "--workspace", default="default", + help="Workspace. Default: %(default)s", + ) + p.add_argument( + "--flow", required=True, + help="Flow id. Comma-separated for round-robin across flows " + "(e.g. onto-rag,doc-rag).", + ) + p.add_argument( + "--duration", type=int, default=60, + help="Test duration in seconds. 
Default: %(default)s", + ) + p.add_argument( + "--concurrency", type=int, default=15, + help="Target in-flight request count. Default: %(default)s", + ) + p.add_argument( + "--services", + default="embeddings,graph-embeddings,triples", + help="Comma-separated services to exercise. " + "Default: %(default)s", + ) + p.add_argument( + "--limit", type=int, default=3, + help="limit for triples / graph-embeddings queries. " + "Default: %(default)s", + ) + p.add_argument( + "--collection", default="default", + help="Collection. Default: %(default)s", + ) + p.add_argument( + "--text", default=DEFAULT_TEXT, + help="Text to embed for embeddings/seed.", + ) + p.add_argument( + "--vector-dim", type=int, default=384, + help="Dimension of synthetic vector when --no-seed is used. " + "Default: %(default)s", + ) + p.add_argument( + "--no-seed", action="store_true", + help="Skip the embeddings warm-up call. Use a random vector " + "for graph-embeddings queries instead.", + ) + p.add_argument( + "--request-timeout", type=float, default=30.0, + help="Per-request timeout (seconds). Default: %(default)s", + ) + p.add_argument( + "--report-interval", type=float, default=5.0, + help="How often to print stats (seconds). Default: %(default)s", + ) + p.add_argument( + "--ping-timeout", type=int, default=120, + help="Websocket ping timeout. 
Default: %(default)s", + ) + p.add_argument( + "--seed", type=int, default=None, + help="Random seed (for reproducibility).", + ) + return p.parse_args() + + +async def seed_vector( + client: WSClient, flow: str, text: str, timeout: float, +) -> list[float]: + """Issue one embeddings request to obtain a real vector that + later graph-embeddings calls can reuse.""" + resp, err, _ = await client.request( + "embeddings", flow, {"texts": [text]}, timeout, + ) + if err or not resp: + raise RuntimeError(f"seed embeddings failed: {err or resp}") + vectors = resp.get("vectors") + if not vectors: + raise RuntimeError(f"seed embeddings: no vectors in response: {resp}") + return vectors[0] + + +def make_request_body( + service: str, args: argparse.Namespace, vector: list[float], +) -> dict: + if service == "embeddings": + return {"texts": [args.text]} + if service == "graph-embeddings": + return { + "vector": vector, + "limit": args.limit, + "collection": args.collection, + } + if service == "triples": + return { + "limit": args.limit, + "collection": args.collection, + } + raise ValueError(f"Unknown service: {service}") + + +async def worker( + name: int, + client: WSClient, + flows: list[str], + services: list[str], + args: argparse.Namespace, + vector: list[float], + stats: dict[str, Stats], + in_flight: dict[str, int], + stop_at: float, +) -> None: + rng = random.Random((args.seed or 0) + name) + while time.monotonic() < stop_at: + svc = rng.choice(services) + flow = rng.choice(flows) + body = make_request_body(svc, args, vector) + + stats[svc].sent += 1 + in_flight[svc] += 1 + try: + resp, err, lat = await client.request( + svc, flow, body, args.request_timeout, + ) + if err == "timeout": + stats[svc].record_timeout() + elif err: + stats[svc].record_err() + else: + stats[svc].record_ok(lat) + except Exception as e: + stats[svc].record_err() + print(f"worker {name}: unexpected {svc} exception: {e}", + file=sys.stderr) + finally: + in_flight[svc] -= 1 + + +async def 
reporter( + services: list[str], + stats: dict[str, Stats], + in_flight: dict[str, int], + stop_at: float, + interval: float, +) -> None: + started = time.monotonic() + last_sent = {s: 0 for s in services} + while time.monotonic() < stop_at: + await asyncio.sleep(interval) + now = time.monotonic() + elapsed = now - started + total_inflight = sum(in_flight.values()) + print( + f"\n[{elapsed:6.1f}s] in-flight={total_inflight} " + f"per-svc={dict(in_flight)}" + ) + for svc in services: + s = stats[svc] + delta = s.sent - last_sent[svc] + rate = delta / interval + last_sent[svc] = s.sent + print(f" {svc:20s} {rate:6.1f}/s | {s.summary()}") + + +async def run(args: argparse.Namespace) -> int: + if args.seed is not None: + random.seed(args.seed) + + services = [s.strip() for s in args.services.split(",") if s.strip()] + flows = [f.strip() for f in args.flow.split(",") if f.strip()] + valid = {"embeddings", "graph-embeddings", "triples"} + bad = [s for s in services if s not in valid] + if bad: + print(f"ERROR: unknown service(s): {bad}. " + f"Supported: {sorted(valid)}", file=sys.stderr) + return 2 + + client = WSClient( + args.url, args.token, args.workspace, args.ping_timeout, + ) + print(f"Connecting to {args.url} ...") + await client.connect() + print(f"Connected. 
workspace={client.workspace} flows={flows} " + f"services={services} concurrency={args.concurrency} " + f"duration={args.duration}s") + + if "graph-embeddings" in services and not args.no_seed: + print("Seeding embedding vector ...") + vector = await seed_vector( + client, flows[0], args.text, args.request_timeout, + ) + print(f"Got vector of length {len(vector)}") + else: + vector = [random.uniform(-1.0, 1.0) for _ in range(args.vector_dim)] + + stats: dict[str, Stats] = defaultdict(Stats) + in_flight: dict[str, int] = defaultdict(int) + for svc in services: + stats[svc] # initialise + in_flight[svc] = 0 + + stop_at = time.monotonic() + args.duration + print(f"Starting load: {args.concurrency} workers for " + f"{args.duration}s ...") + + workers = [ + asyncio.create_task( + worker( + i, client, flows, services, args, vector, + stats, in_flight, stop_at, + ) + ) + for i in range(args.concurrency) + ] + rep = asyncio.create_task( + reporter(services, stats, in_flight, stop_at, args.report_interval) + ) + + try: + await asyncio.gather(*workers) + finally: + rep.cancel() + try: + await rep + except asyncio.CancelledError: + pass + + print("\n=== Final results ===") + any_failures = False + for svc in services: + s = stats[svc] + print(f" {svc:20s} {s.summary()}") + if s.timeout > 0 or s.err > 0: + any_failures = True + + await client.close() + + return 1 if any_failures else 0 + + +def main() -> int: + args = parse_args() + try: + return asyncio.run(run(args)) + except KeyboardInterrupt: + return 130 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/docs/tech-specs/bootstrap.md b/docs/tech-specs/bootstrap.md new file mode 100644 index 00000000..af7387d1 --- /dev/null +++ b/docs/tech-specs/bootstrap.md @@ -0,0 +1,297 @@ +--- +layout: default +title: "Bootstrap Framework Technical Specification" +parent: "Tech Specs" +--- + +# Bootstrap Framework Technical Specification + +## Overview + +A generic, pluggable framework for running one-time initialisation 
steps +against a TrustGraph deployment — replacing the dedicated +`tg-init-trustgraph` container with a long-running processor that +converges the system to a desired initial state and then idles. + +The framework is content-agnostic. It knows how to run, retry, +mark-as-done, and surface failures; the actual init work lives in +small pluggable classes called **initialisers**. Core initialisers +ship in the `trustgraph-flow` package; enterprise and third-party +initialisers can be loaded by dotted path without any core code +change. + +## Motivation + +The existing `tg-init-trustgraph` is a one-shot CLI run in its own +container. It performs two very different jobs (Pulsar topology +setup and config seeding) in a single script, is wasteful as a whole +container, cannot handle partial-success states, and has no way to +extend the boot process with enterprise-specific concerns (user +provisioning, workspace initialisation, IAM scaffolding) without +forking the tool. + +A pluggable, long-running reconciler addresses all of this and slots +naturally into the existing processor-group model. + +## Design + +### Bootstrapper Processor + +A single `AsyncProcessor` subclass. One entry in a processor group. +Parameters include the processor's own identity and a list of +**initialiser specifications** — each spec names a class (by dotted +path), a unique instance name, a flag string, and the parameters +that will be passed to the initialiser's constructor. + +On each wake the bootstrapper does the following, in order: + +1. Open a short-lived context (config client, flow-svc client, + logger). The context is torn down at the end of the wake so + steady-state idle cost is effectively nil. +2. Run all **pre-service initialisers** (those that opt out of the + service gate — principally `PulsarTopology`, which must run + before the services it gates on can even come up). +3. Check the **service gate**: cheap round-trips to config-svc and + flow-svc. 
If either fails, skip to the sleep step using the + short gate-retry cadence. +4. Run all **post-service initialisers** that haven't already + completed at the currently-configured flag. +5. Sleep. Cadence adapts to state (see below). + +### Initialiser Contract + +An initialiser is a class with: + +- A class-level `name` identifier, unique within the bootstrapper's + configuration. This is the key under which completion state is + stored. +- A class-level `wait_for_services` flag. When `True` (the default) + the initialiser runs only after the service gate passes. When + `False`, it runs before the gate, on every wake. +- A constructor that accepts the initialiser's own params as kwargs. +- An async `run(ctx, old_flag, new_flag)` method that performs the + init work and returns on success. Any raised exception is + logged and treated as a transient failure — the stored flag is + not updated and the initialiser will re-run on the next cycle. + +`old_flag` is the previously-stored flag string, or `None` if the +initialiser has never successfully run in this deployment. `new_flag` +is the flag the operator has configured for this run. This pair +lets an initialiser distinguish a clean first-run from a migration +between flag versions and behave accordingly (see "Flag change and +re-run safety" below). + +### Context + +The context is the bootstrapper-owned object passed to every +initialiser's `run()` method. Its fields are deliberately narrow: + +| Field | Purpose | +|---|---| +| `logger` | A child logger named for the initialiser instance | +| `config` | A short-lived `ConfigClient` for config-svc reads/writes | +| `flow` | A short-lived `RequestResponse` client for flow-svc | + +The context is always fully-populated regardless of which services +a given initialiser uses, for symmetry. Additional fields may be +added in future without breaking existing initialisers. Clients are +started at the beginning of a wake cycle and stopped at the end. 
+ +Initialisers that need services beyond config-svc and flow-svc are +responsible for their own readiness checks and for raising cleanly +when a prerequisite is not met. + +### Completion State + +Per-initialiser completion state is stored in the reserved +`__system__` workspace, under a dedicated config type for bootstrap +state. The stored value is the flag string that was configured when +the initialiser last succeeded. + +On each cycle, for each initialiser, the bootstrapper reads the +stored flag and compares it to the currently-configured flag. If +they match, the initialiser is skipped silently. If they differ, +the initialiser runs; on success, the stored flag is updated. + +Because the state lives in a reserved (`_`-prefixed) workspace, it +is stored by config-svc but excluded from the config push broadcast. +Live processors never see it and cannot act on it. + +### The Service Gate + +The gate is a cheap, bootstrapper-internal check that config-svc +and flow-svc are both reachable and responsive. It is intentionally +a simple pair of low-cost round-trips — a config list against +`__system__` and a flow-svc `list-blueprints` — rather than any +deeper health check. + +Its purpose is to avoid filling logs with noise and to concentrate +retry effort during the brief window when services are coming up. +The gate is applied only to initialisers with +`wait_for_services=True` (the default); `False` is reserved for +initialisers that set up infrastructure the gate itself depends on. 
+ +### Adaptive Cadence + +The sleep between wake cycles is chosen from three tiers based on +observed state: + +| Tier | Duration | When | +|---|---|---| +| Gate backoff | ~5 s | Services not responding — concentrate retry during startup | +| Init retry | ~15 s | Gate passes but at least one initialiser is not yet at its configured flag — transient failures, waiting on prereqs, recently-bumped flag not yet applied | +| Steady | ~300 s | All configured initialisers at their configured flag; gate passes; nothing to do | + +The short tiers ensure a fresh deployment converges quickly; +steady state costs a single round-trip per initialiser every few +minutes. + +### Failure Handling + +An initialiser raising an exception does not stop the bootstrapper +or block other initialisers. Each initialiser in the cycle is +attempted independently; failures are logged and retried on the next +cycle. This means there is no ordered-DAG enforcement: order of +initialisers in the configuration determines the attempt order +within a cycle, but a dependency between two initialisers is +expressed by the dependant raising cleanly when its prerequisite +isn't satisfied. Over successive cycles the system converges. + +### Flag Change and Re-run Safety + +Each initialiser's completion state is a string flag chosen by the +operator. Typically these follow a simple version pattern +(`v1`, `v2`, ...), but the bootstrapper imposes no format. + +Changing the flag in the group configuration causes the +corresponding initialiser to re-run on the next cycle. Initialisers +must be written so that re-running after a flag bump is safe — they +receive both the previous and the new flag and are responsible for +either cleanly re-applying the work or performing a step-change +migration from the prior state. + +This gives operators an explicit, visible mechanism for triggering +re-initialisation. Re-runs are never implicit. 
+ +## Core Initialisers + +The following initialisers ship in `trustgraph.bootstrap.initialisers` +and cover the base deployment case. + +### PulsarTopology + +Creates the Pulsar tenant and the four namespaces +(`flow`, `request`, `response`, `notify`) with appropriate +retention policies if they don't exist. + +Opts out of the service gate (`wait_for_services = False`) because +config-svc and flow-svc cannot come online until the Pulsar +namespaces exist. + +Parameters: Pulsar admin URL, tenant name. + +Idempotent via the admin API (GET-then-PUT). Flag change causes +re-evaluation of all namespaces; any absent are created. + +### TemplateSeed + +Populates the reserved `__template__` workspace from an external +JSON seed file. The seed file has the standard shape of +`{config-type: {config-key: value}}`. + +Runs post-gate. Parameters: path to the seed file, overwrite +policy (upsert-missing only, or overwrite-all). + +On clean run, writes the whole file. On flag change, behaviour +depends on the overwrite policy — typically upsert-missing so +that operator-customised keys are preserved across seed-file +upgrades. + +### WorkspaceInit + +Creates a named workspace and populates it from the seed file or +from the full contents of the `__template__` workspace. + +Runs post-gate. Parameters: workspace name, source (seed file or +`__template__`), optional `seed_file` path, `overwrite` flag. + +When `source` is `template`, the initialiser copies every config +type and key present in `__template__` — there is no per-type +selection. Deployments that want to seed only a subset should +either curate the seed file they feed to `TemplateSeed` or use +`source: seed-file` directly here. + +Raises cleanly if its source does not exist — depends on +`TemplateSeed` having run in the same cycle or a prior one. + +### DefaultFlowStart + +Starts a specific flow in a specific workspace using a specific +blueprint. + +Runs post-gate. 
Parameters: workspace name, flow id, blueprint +name, description, optional parameter overrides. + +Separated from `WorkspaceInit` deliberately so that deployments +which want a workspace without an auto-started flow can simply omit +this initialiser from their bootstrap configuration. + +## Extensibility + +New initialisers are added by: + +1. Subclassing the initialiser base class. +2. Implementing `run(ctx, old_flag, new_flag)`. +3. Choosing `wait_for_services` (almost always `True`). +4. Adding an entry in the bootstrapper's configuration with the new + class's dotted path. + +No core code changes are required to add an enterprise or third-party +initialiser. Enterprise builds ship their own package with their own +initialiser classes (e.g. `CreateAdminUser`, `ProvisionWorkspaces`) +and reference them in the bootstrapper config alongside the core +initialisers. + +## Reserved Workspaces + +This specification relies on the "reserved workspace" convention: + +- Any workspace id beginning with `_` is reserved. +- Reserved workspaces are stored normally by config-svc but never + appear in the config push broadcast. +- Live processors cannot react to reserved-workspace state. + +The bootstrapper uses two reserved workspaces: + +- `__template__` — factory-default seed config, readable by + initialisers that copy-from-template. +- `__system__` — bootstrapper completion state (under the + `init-state` config type) and any other system-internal bookkeeping. + +See the reserved-workspace convention in the config service for +the general rule and its enforcement. + +## Non-Goals + +- No DAG scheduling across initialisers. Dependencies are expressed + by the dependant failing cleanly until its prerequisite is met, + and convergence over subsequent cycles. +- No parallel execution of initialisers within a cycle. A cycle runs + each initialiser sequentially. +- No implicit re-runs. Re-running an initialiser requires an explicit + flag change by the operator. 
+- No cross-initialiser atomicity. Each initialiser's completion is + recorded independently on its own success. + +## Operational Notes + +- Running the bootstrapper as a processor-group entry replaces the + previous `tg-init-trustgraph` container. The bootstrapper is also + CLI-invocable directly for standalone testing via + `Processor.launch(...)`. +- First-boot convergence is typically a handful of short cycles + followed by a transition to the steady cadence. Deployments + should expect the first few minutes of logs to show + initialisation activity, thereafter effective silence. +- Bumping a flag is a deliberate operational act. The log line + emitted on re-run makes the event visible for audit. diff --git a/docs/tech-specs/capabilities.md b/docs/tech-specs/capabilities.md new file mode 100644 index 00000000..7717cbc9 --- /dev/null +++ b/docs/tech-specs/capabilities.md @@ -0,0 +1,273 @@ +--- +layout: default +title: "Capability Vocabulary Technical Specification" +parent: "Tech Specs" +--- + +# Capability Vocabulary Technical Specification + +## Overview + +Every gateway endpoint maps to exactly one *capability* — a string +from a closed vocabulary defined in this document. When the +gateway authorises a request, it hands the IAM regime four things: +the authenticated identity, the required capability, the +operation's resource (the structured identifier of what's being +operated on), and the operation's parameters. The IAM regime +decides allow or deny; see the [IAM contract](iam-contract.md) for +the full abstraction. + +A capability is a **permission**, not a structural classification. +`graph:read` says "the caller may read graphs"; it does not say +where graphs live or how they are addressed. The shape of a +request — whether workspace appears in the URL, the envelope, or +the body, and whether it is a resource address component or an +operation parameter — is determined by what the operation operates +on, not by what permission it requires. 
Permission and structure +are orthogonal; the contract takes both. + +This document defines: + +- The **capability vocabulary** — the closed list of capability + strings the gateway uses as input to `authorise`. All IAM + regimes share this vocabulary; that's the only schema the + gateway and the IAM regime have to agree on. +- The **open-source role bundles** — the role-and-scope table the + OSS IAM regime uses to answer `authorise` calls. Other regimes + answer the same call differently; the bundles below are an + OSS-specific implementation detail, not a contract assertion. + +A regime may evaluate `authorise` using role bundles (OSS), IdP +group memberships, attribute-based policies, relationship tuples, +or any other mechanism. The gateway is unaware of which. The +capability strings — and the resource component vocabulary the +gateway populates alongside them — are the only thing both sides +have to agree on. + +## Motivation + +The original IAM spec used hierarchical "minimum role" checks +(`admin` implies `writer` implies `reader`). That shape is simple +but paints the role model into a corner: any enterprise need to +grant a subset of admin abilities (helpdesk that can reset +passwords but not edit flows; analyst who can query but not ingest) +requires a protocol-level change. + +A capability vocabulary decouples "what a request needs" from +"what roles a user has" and makes the role table pure data. The +open-source bundles can stay coarse while the enterprise role +table expands without any code movement. + +## Design + +### Capability string format + +`:` or `` (for capabilities with no +natural read/write split). All lowercase, kebab-case for +multi-word subsystems. 
+ +### Capability list + +**Data plane** + +| Capability | Covers | +|---|---| +| `agent` | agent (query-only; no write counterpart) | +| `graph:read` | graph-rag, graph-embeddings-query, triples-query, sparql, graph-embeddings-export, triples-export | +| `graph:write` | triples-import, graph-embeddings-import | +| `documents:read` | document-rag, document-embeddings-query, document-embeddings-export, entity-contexts-export, document-stream-export, library list / fetch | +| `documents:write` | document-embeddings-import, entity-contexts-import, text-load, document-load, library add / replace / delete | +| `rows:read` | rows-query, row-embeddings-query, nlp-query, structured-query, structured-diag | +| `rows:write` | rows-import | +| `llm` | text-completion, prompt (stateless invocation) | +| `embeddings` | Raw text-embedding service (stateless compute; typed-data embedding stores live under their data-subject capability) | +| `mcp` | mcp-tool | +| `collections:read` | List / describe collections | +| `collections:write` | Create / delete collections | +| `knowledge:read` | List / get knowledge cores | +| `knowledge:write` | Create / delete knowledge cores | + +**Control plane** + +| Capability | Covers | +|---|---| +| `config:read` | Read workspace config | +| `config:write` | Write workspace config | +| `flows:read` | List / describe flows, blueprints, flow classes | +| `flows:write` | Start / stop / update flows | +| `users:read` | List / get users within the workspace | +| `users:write` | Create / update / disable users within the workspace | +| `users:admin` | Assign / remove roles on users within the workspace | +| `keys:self` | Create / revoke / list **own** API keys | +| `keys:admin` | Create / revoke / list **any user's** API keys within the workspace | +| `workspaces:admin` | Create / delete / disable workspaces (system-level) | +| `iam:admin` | JWT signing-key rotation, IAM-level operations | +| `metrics:read` | Prometheus metrics proxy | + +### 
Open-source role bundles + +The open-source edition ships three roles: + +| Role | Capabilities | +|---|---| +| `reader` | `agent`, `graph:read`, `documents:read`, `rows:read`, `llm`, `embeddings`, `mcp`, `collections:read`, `knowledge:read`, `flows:read`, `config:read`, `keys:self` | +| `writer` | everything in `reader` **+** `graph:write`, `documents:write`, `rows:write`, `collections:write`, `knowledge:write` | +| `admin` | everything in `writer` **+** `config:write`, `flows:write`, `users:read`, `users:write`, `users:admin`, `keys:admin`, `workspaces:admin`, `iam:admin`, `metrics:read` | + +Open-source bundles are deliberately coarse. `workspaces:admin` and +`iam:admin` live inside `admin` without a separate role; a single +`admin` user holds the keys to the whole deployment. + +### The `agent` capability and composition + +The `agent` capability is granted independently of the capabilities +it composes under the hood (`llm`, `graph`, `documents`, `rows`, +`mcp`, etc.). A user holding `agent` but not `llm` can still cause +LLM invocations because the agent implementation chooses which +services to invoke on the caller's behalf. + +This is deliberate. A common policy is "allow controlled access +via the agent, deny raw model calls" — granting `agent` without +granting `llm` expresses exactly that. An administrator granting +`agent` should treat it as a grant of everything the agent +composes at deployment time. + +### Authorisation evaluation (OSS regime) + +This section describes how the OSS IAM regime answers +`authorise(identity, capability, resource, parameters)`. Other +regimes answer the same contract differently; only the inputs (the +capability vocabulary, the resource components, the parameter +shape) are shared. 
+ +For a request bearing a resolved set of roles +`R = {r1, r2, ...}`, a required capability `c`, a resource, and +parameters: + +``` +let target_workspace = + resource.workspace (workspace-/flow-level resources) + or parameters.workspace (system-level resources whose + parameters reference a workspace) + or unset (system-level operations with no + workspace context) + +allow if some role r in R has c in its capability bundle + and (target_workspace is unset + or r's workspace_scope permits target_workspace) +``` + +The OSS regime considers workspace from whichever role it plays in +the operation: + +- For workspace-level and flow-level resources, the workspace lives + in `resource.workspace` and that is what the role's scope is + checked against. +- For system-level resources whose operation parameters reference a + workspace (e.g. `create-user with workspace association W`), + workspace lives in `parameters.workspace` and that is what the + role's scope is checked against. The resource is system-level + (`resource = {}`) but the workspace constraint still bites. +- For system-level operations with no workspace context (e.g. + `bootstrap`, `rotate-signing-key`), the workspace-scope check + collapses — only capability-bundle membership matters. + +No hierarchy, no precedence, no role-order sensitivity. A user +with a single role is the common case; a user with multiple roles +is allowed if any role independently grants both the capability +and the relevant workspace scope. + +### Enforcement boundary + +Capability checks — and authentication — are applied **only at the +API gateway**, on requests arriving from external callers. +Operations originating inside the platform (backend service to +backend service, agent to LLM, flow-svc to config-svc, bootstrap +initialisers, scheduled reconcilers, autonomous flow steps) are +**not capability-checked**. 
Backend services trust the workspace +set by the gateway on inbound pub/sub messages and trust +internally-originated messages without further authorisation. + +This policy has four consequences that are part of the spec, not +accidents of implementation: + +1. **The gateway is the single trust boundary for user + authorisation.** Every backend service is a downstream consumer + of an already-authorised workspace scope. +2. **Pub/sub carries workspace, not user identity.** Messages on + the bus do not carry credentials or the identity that originated + a request; they carry the resolved workspace only. This keeps + the bus protocol free of secrets and aligns with the workspace + resolver's role as the gateway-side narrowing step. +3. **Composition is transitive.** Granting a capability that the + platform composes internally (for example, `agent`) transitively + grants everything that capability composes under the hood, + because the downstream calls are internal-origin and are not + re-checked. The composite nature of `agent` described above is + a consequence of this policy, not a special case. +4. **Internal-origin operations have no user.** Bootstrap, + reconcilers, and other platform-initiated work act with + system-level authority. The workspace field on such messages + identifies which workspace's data is being touched, not who + asked. + +**Trust model.** Whoever has pub/sub access is implicitly trusted +to act as any workspace. Defense-in-depth within the backend is +not part of this design; the security perimeter is the gateway +and the bus itself (TLS / network isolation between the bus and +any untrusted network). + +### Unknown capabilities and unknown roles + +- An endpoint declaring an unknown capability is a server-side bug + and fails closed (403, logged). +- A user carrying a role name that is not defined in the role table + is ignored for authorisation purposes and logged as a warning. 
+ Behaviour is deterministic: unknown roles contribute zero + capabilities. + +### Capability scope + +Every capability is **implicitly scoped to the caller's resolved +workspace**. A `users:write` capability does not permit a user +in workspace `acme` to create users in workspace `beta` — the +workspace-resolver has already narrowed the request to one +workspace before the capability check runs. See the IAM +specification for the workspace-resolver contract. + +The three exceptions are the system-level capabilities +`workspaces:admin` and `iam:admin`, which operate across +workspaces by definition, and `metrics:read`, which returns +process-level series not scoped to any workspace. + +## Enterprise extensibility + +Enterprise editions extend the role table additively: + +``` +data-analyst: {query, library:read, collections:read, knowledge:read} +helpdesk: {users:read, users:write, users:admin, keys:admin} +data-engineer: writer + {flows:read, config:read} +workspace-owner: admin − {workspaces:admin, iam:admin} +``` + +None of this requires a protocol change — the wire-protocol `roles` +field on user records is already a set, the gateway's +capability-check is already capability-based, and the capability +vocabulary is closed. Enterprises may introduce roles whose bundles +compose the same capabilities differently. + +When an enterprise introduces a new capability (e.g. for a feature +that does not exist in open source), the capability string is +added to the vocabulary and recognised by the gateway build that +ships that feature. + +## References + +- [IAM Contract Specification](iam-contract.md) — the abstract + gateway↔IAM regime contract; capability strings are inputs to + `authorise`. +- [Identity and Access Management Specification](iam.md) +- [IAM Service Protocol Specification](iam-protocol.md) — the OSS + regime's wire-level protocol. 
+- [Architecture Principles](architecture-principles.md) diff --git a/docs/tech-specs/data-ownership-model.md b/docs/tech-specs/data-ownership-model.md index ea94ec46..b112d195 100644 --- a/docs/tech-specs/data-ownership-model.md +++ b/docs/tech-specs/data-ownership-model.md @@ -22,8 +22,16 @@ are the boundaries around data, and who owns what? A workspace is the primary isolation boundary. It represents an organisation, team, or independent operating unit. All data belongs to -exactly one workspace. Cross-workspace access is never permitted through -the API. +exactly one workspace. + +Cross-workspace access through the API is gated by the IAM regime +(see [`iam-contract.md`](iam-contract.md)). In the OSS distribution, +the role table defined in [`capabilities.md`](capabilities.md) +permits cross-workspace operation only to the `admin` role; the +`reader` and `writer` roles are constrained to a single assigned +workspace per credential. Other regimes can model the relationship +between identity and workspace differently — the gateway makes no +assumption. A workspace owns: - Source documents @@ -279,9 +287,18 @@ A typical workflow: The current codebase uses a `user` field in message metadata and storage partition keys to identify the workspace. The `collection` field -identifies the collection within that workspace. The IAM spec describes -how the gateway maps authenticated credentials to a workspace identity -and sets these fields. +identifies the collection within that workspace. + +The gateway is the single point at which workspace gets stamped onto +outbound pub/sub messages. 
An incoming credential authenticates to a +workspace (the credential's binding, not a user-to-workspace lookup — +see [`iam-contract.md`](iam-contract.md) and the *Identity surface* +section of [`iam.md`](iam.md)); any caller-supplied workspace on the +request is reconciled against the authenticated identity by the IAM +regime; the resolved value is what the gateway writes into outgoing +messages and the storage layers' partition keys. Backend services +trust the workspace they receive — defense-in-depth happens at the +gateway, not at the bus. For details on how each storage backend implements this scoping, see: @@ -302,7 +319,10 @@ For details on how each storage backend implements this scoping, see: ## References -- [Identity and Access Management](iam.md) +- [IAM Contract](iam-contract.md) — gateway↔IAM regime abstraction. +- [Identity and Access Management](iam.md) — gateway-side framing. +- [Capability Vocabulary](capabilities.md) — capability strings and + the OSS role bundles that decide cross-workspace eligibility. - [Collection Management](collection-management.md) - [Entity-Centric Graph](entity-centric-graph.md) - [Neo4j User Collection Isolation](neo4j-user-collection-isolation.md) diff --git a/docs/tech-specs/flow-class-definition.md b/docs/tech-specs/flow-blueprint-definition.md similarity index 100% rename from docs/tech-specs/flow-class-definition.md rename to docs/tech-specs/flow-blueprint-definition.md diff --git a/docs/tech-specs/iam-contract.md b/docs/tech-specs/iam-contract.md new file mode 100644 index 00000000..da23fb31 --- /dev/null +++ b/docs/tech-specs/iam-contract.md @@ -0,0 +1,403 @@ +--- +layout: default +title: "IAM Contract Technical Specification" +parent: "Tech Specs" +--- + +# IAM Contract Technical Specification + +## Overview + +The IAM contract is the abstraction between the API gateway and any +identity / access management regime that fronts it. 
The gateway +treats IAM as a black box behind two operations — *authenticate* and +*authorise* — plus a small surface of management operations. No +regime-specific concept (roles, scopes, groups, claims, policy +languages) is visible to the gateway, and no gateway-specific +concept (capability vocabulary, request anatomy) is visible to +backend services. + +The TrustGraph open-source distribution ships one IAM regime — a +role-based implementation defined in +[`iam-protocol.md`](iam-protocol.md) — that is one implementation of +this contract. Enterprise editions can replace it with a different +regime (OIDC / SSO, ABAC, ReBAC, external policy engine) without +changing the gateway, the wire protocol, or the backends. + +## Motivation + +Authorisation models vary by deployment. A small team might be +happy with three predefined roles; an enterprise might need group- +mapping from an upstream IdP, attribute-based policies, or +relationship-based access control. Hard-wiring any one of those +into the gateway forces every other regime to either compromise its +model or be re-implemented. + +A narrow contract — "authenticate this credential" and "may this +identity perform this operation on this resource" — captures what +the gateway actually needs to know without committing to a policy +shape. The IAM regime owns the policy decision; the gateway is a +generic enforcement point. + +## Operations + +### `authenticate` + +``` +authenticate(credential: bytes) → Identity | AuthFailure +``` + +Validates a credential the client presented. The gateway treats +the credential as opaque bytes — for the OSS regime today that's +either an API key plaintext or a JWT, but the gateway does not +parse them; the IAM regime decides. + +On success, returns an `Identity`. On any failure the IAM regime +returns the same opaque `AuthFailure` — never a description of which +condition failed. 
This is the spec's masked-error rule: an +attacker probing the endpoint cannot distinguish "no such key", +"expired", "wrong signature", "revoked", "user disabled", etc. + +### `authorise` + +``` +authorise(identity: Identity, + capability: str, + resource: Resource, + parameters: dict) + → Decision +``` + +Asks whether the identity is permitted to perform the named +capability on the named resource, given the operation's +parameters. Returns `allow` or `deny`. `identity` is whatever +`authenticate` returned for this caller; the gateway never +decomposes it. + +The four arguments separate concerns: + +- **`identity`** — who is asking. +- **`capability`** — what permission they are exercising (e.g. + `users:write`, `graph:read`). Permission, not structure. +- **`resource`** — what is being operated on, as a structured + identifier. See *The Resource model* below. +- **`parameters`** — operation-specific data that the regime may + need to consider beyond the resource identifier. Used when a + decision depends on attributes the request supplies — e.g. + creating a user *with workspace association W*: the resource is + the system-level user registry, and W is a parameter the regime + checks against the caller's permissions for `users:write`. + +Different regimes use the four arguments differently — one regime +might evaluate role bundles whose grants carry workspace scope; +another might consult upstream IdP group memberships; an ABAC +regime evaluates a policy with all four as inputs. The contract +is unchanged. + +### `authorise_many` + +``` +authorise_many(identity: Identity, + checks: list[(str, Resource, dict)]) + → list[Decision] +``` + +Bulk variant of `authorise`. Same semantics, one round-trip for +many decisions. Used when an operation fans out to multiple +resources (e.g. an agent that touches several workspaces) and a +single permission check isn't sufficient. 
`authorise_many` is not just a performance optimisation; it pins
the contract for fan-out operations early, before clients (or
internal callers) build patterns that assume one permission check
per request. Regimes implement it as a loop over `authorise`
unless they have a more efficient path.
+ +### Actor injection + +For any management operation forwarded by the gateway after +authentication, the gateway injects the authenticated caller's +`handle` as an `actor` field on the request. Regimes use `actor` +to identify *who is making the request* — distinct from the +operation's target (which lives in `user_id` / `key_id` / +`workspace_record` / etc.) — for purposes such as: + +- Self-service operations (`whoami`, `change-password`) that + resolve "the caller" without taking a target argument. +- Audit logging, where the actor is recorded against the change. +- Decisions that depend on the resolved resource state. The + gateway authorises against the parameters on the request, but it + cannot know the resolved resource's actual properties (e.g. the + workspace association of a target user) before the regime loads + it. When that matters, the regime can re-decide using the + actor's permissions and the resolved record — closing a class + of cases the gateway-side check can't see. + +Caller-supplied `actor` values on the request body are overwritten +by the gateway — the gateway is the only authority for actor +identity, and a regime that consults `actor` can rely on it being +authentic. + +## The `Identity` surface + +`Identity` is *mostly* opaque. The gateway holds the value as a +token to quote back when calling `authorise`, never decomposing it. +But there are a few gateway-side concerns that need a small +surface: + +| Field | Purpose | +|---|---| +| `handle` | Opaque reference passed back to `authorise`. Regime-defined; gateway treats as a string. | +| `workspace` | The workspace this credential authenticates to. Used by the gateway only as a default-fill-in for operations that omit a workspace. Never used as policy input — when authorisation needs to know which workspace the operation acts on, the operation places it in the resource address (or a parameter), and the regime decides. 
| +| `principal_id` | Stable identifier the gateway logs for audit (a user id, a sub claim, a service account id). Never used for authorisation — that's `authorise`'s job. | +| `source` | How the credential was presented (`api-key`, `jwt`, …). Non-policy; useful for logs and metrics only. | + +Anything else — roles, claims, group memberships, policy attributes +— stays inside the regime and is reachable only via `authorise`. + +## The `Resource` model + +A `Resource` is a structured value identifying *what is being +operated on*. Resources live at one of three levels in TrustGraph, +based on where the resource exists in the deployment: + +### Resource levels + +| Level | What lives there | Resource shape | +|---|---|---| +| **System** | The user registry, the workspace registry, the signing key, the audit log — anything that exists once per deployment. | `{}` | +| **Workspace** | A workspace's config, flow definitions, library (documents), knowledge cores, collections — things that exist *within* a workspace. | `{workspace: "..."}` | +| **Flow** | A flow's knowledge graph, agent state, LLM context, embedding state, MCP context — things that exist *within* a flow within a workspace. | `{workspace: "...", flow: "..."}` | + +Note carefully: + +- **Users are a system-level resource.** A user record exists at + the deployment level; the fact that a user has a *workspace + association* (one in OSS, possibly many in other regimes) is a + property of the user record, not a containment. Operations on + the user registry have `resource = {}`; the workspace + association appears as a *parameter*, not as a resource address + component. +- **Workspaces themselves are a system-level resource.** The + workspace registry exists at the deployment level. `create- + workspace` and `list-workspaces` are system-level operations; + the workspace identifier in their bodies is a parameter, not an + address. 
+- **A workspace's contents are workspace-level resources.** A + workspace's config, flows, library, etc. live within a + workspace. Their resource address is `{workspace: ...}`. +- **A flow's contents are flow-level resources.** Knowledge + graphs, agents, etc. live within a flow. Their resource + address is `{workspace: ..., flow: ...}`. + +### Component vocabulary + +| Component | Type | Meaning | Used by | +|---|---|---|---| +| `workspace` | string | Identifier of the workspace whose contents are being operated on | workspace-level and flow-level resource addresses | +| `flow` | string | Identifier of a flow within a workspace; always paired with `workspace` | flow-level resource addresses | +| `collection` | string | Reserved for finer-grained scoping within a workspace | future / enterprise | +| `document` | string | Reserved for per-document scoping | future / enterprise | + +A `Resource` is a partial mapping of these components to values. +The level of the resource (system / workspace / flow) determines +which components must be present. An empty `{}` is the +system-level resource. + +### Workspace as parameter vs. address + +Workspace plays two distinct roles in operations and shows up in +two distinct places: + +- **As a resource address component** — workspace identifies the + thing being operated on. Lives in `resource.workspace`. Example: + `config:read` reads the config *of* workspace W. +- **As an operation parameter** — workspace is data the operation + acts on or filters by, while the resource itself is system-level. + Lives in `parameters.workspace`. Example: `users:write` + creates a user *with workspace association* W; the resource is + the user registry (system), and W is a parameter. + +These are not interchangeable. 
The IAM regime considers each of these placements separately; the
OSS role table, for instance, applies workspace scope to the
address component when checking workspace-level operations, and to
a parameter when checking "create-user-with-workspace-W". Both end
up enforcing the caller's scope, but through different code paths.
+ +In the OSS gateway, registry keys follow these conventions: + +| Pattern | Used by | Resource level | +|---|---|---| +| bare op name (`create-user`, `list-users`, `login`, …) | `/api/v1/iam` and the auth surface | system / workspace, per op | +| `:` (`config:get`, `flow:list-blueprints`, `librarian:add-document`, …) | `/api/v1/{kind}` (workspace-scoped global services) | workspace | +| `flow-service:` (`flow-service:agent`, `flow-service:graph-rag`, …) | `/api/v1/flow/{flow}/service/{kind}` and the WS Mux | flow | +| `flow-import:` / `flow-export:` | `/api/v1/flow/{flow}/{import,export}/{kind}` streaming sockets | flow | + +Keys are an OSS-gateway implementation detail — the contract does +not constrain naming. The conventions above exist so the registry +key is uniquely derivable from the request path and (where +applicable) body without ambiguity. + +## Caching + +Both `authenticate` and `authorise` results are cached at the +gateway, on different policies: + +- **`authenticate`** — cached by a hash of the credential. The OSS + gateway uses a fixed short TTL (currently 60 s) so that revoked + API keys and disabled users stop working within the TTL window + without any push mechanism. Regimes that want a different + behaviour can return an `expires` hint with the identity; the + gateway honours the smaller of `expires` and its own ceiling. + +- **`authorise`** — cached by a hash of `(handle, capability, + resource, parameters)`. The regime returns a suggested TTL with + the decision; the gateway clamps it above by a deployment-set + ceiling (currently 60 s). Both allow and deny decisions are + cached; denies briefly, to avoid hammering the regime with + repeated rejected attempts. + +The TTL ceiling caps the revocation latency window — a role +revoked at the regime takes effect at the gateway no later than +the ceiling. Operators that need stricter revocation can lower +the ceiling. 
+ +## Failure modes + +| Condition | Behaviour | +|---|---| +| `authenticate` returns AuthFailure | Gateway responds 401 with the masked `auth failure` body. | +| `authorise` returns deny | Gateway responds 403 with the masked `access denied` body. | +| IAM regime unreachable | Gateway responds 401 / 503 (deployment-defined). No fail-open. | +| `authorise_many` partial deny | Gateway treats the request as denied; the operation is rejected. Partial-success semantics are not part of the contract. | +| Regime returns "not supported" for a management operation | Gateway responds 501. | + +There is no fallback or "soft" decision path. An IAM regime that +is unavailable, slow, or returning errors causes requests to fail +closed. + +## Implementations + +### Open-source role-based regime + +Defined in [`iam-protocol.md`](iam-protocol.md). Implements the +contract via: + +- A pub/sub request/response service (`iam-svc`) reached only by + the gateway over the message bus. +- Credentials are API keys (opaque) or JWTs (Ed25519, locally + validated by the gateway against the regime's published public + key). +- `authorise` reduces to a lookup against the role bundles in + [`capabilities.md`](capabilities.md), with each grant's workspace + scope checked against the operation's workspace component. +- Identity, user, and workspace records live in Cassandra. + +The OSS regime is deliberately simple — three roles, a single +workspace association per user (a regime data-model decision, not +a contract assertion), no policy language. Other regimes can +grant the same user different permissions in different workspaces +without changing anything outside the regime. 
+ +### Future regimes + +The contract is shaped to admit, without code change in the +gateway: + +- **OIDC / SSO** — `authenticate` validates an OIDC ID token via + the IdP's JWKS; `Identity.handle` carries the verified subject + and group claims; `authorise` evaluates against group-to- + capability mappings configured at the regime. +- **ABAC / Policy engine** — `authorise` calls out to a policy + engine (Rego, Cedar, custom DSL) with the identity's attributes + and the resource as the policy input. +- **ReBAC (Zanzibar-style)** — `authorise` translates `(identity, + capability, resource)` into a relationship-tuple lookup against + a tuple store. +- **Hybrid** — multiple regimes composed: e.g. authenticate via + SSO, authorise via local policy. + +None of these require gateway changes. The contract surface is +the same; the regime is what differs. + +## References + +- [Identity and Access Management Specification](iam.md) — overall + design and the gateway-side framing. +- [IAM Service Protocol Specification](iam-protocol.md) — the OSS + regime's wire-level protocol. +- [Capability Vocabulary Specification](capabilities.md) — the + capability strings the gateway uses as `authorise` input. diff --git a/docs/tech-specs/iam-protocol.md b/docs/tech-specs/iam-protocol.md new file mode 100644 index 00000000..e7e7984e --- /dev/null +++ b/docs/tech-specs/iam-protocol.md @@ -0,0 +1,386 @@ +--- +layout: default +title: "IAM Service Protocol Technical Specification" +parent: "Tech Specs" +--- + +# IAM Service Protocol Technical Specification + +## Overview + +This document specifies the wire protocol of the **open-source IAM +regime** — one implementation of the abstract IAM contract defined +in [`iam-contract.md`](iam-contract.md). Other regimes (OIDC / SSO, +ABAC, ReBAC, external policy engines) implement the same contract +with different transports, data models, and policy semantics; the +gateway is unaware of which regime it's wired against. 
+ +The OSS regime is a backend processor (`iam-svc`) reached over the +standard request/response pub/sub pattern. It owns users, +workspaces, API keys, login credentials, and JWT signing keys, all +backed by Cassandra. The API gateway is its only caller. + +This document defines: + +- the `IamRequest` and `IamResponse` dataclasses on the bus, +- the operation set the OSS regime implements, +- per-operation input and output fields, +- the error taxonomy, +- the bootstrap modes, +- the initial HTTP forwarding endpoint used while the protocol is + being exercised. + +The mapping from this regime onto the abstract contract is direct: + +| Contract operation | OSS regime operation | +|---|---| +| `authenticate(credential)` | `resolve-api-key` (for API keys); local JWT validation against `get-signing-key-public` (for JWTs) | +| `authorise(identity, capability, resource, parameters)` | Role-table lookup against the OSS role bundles defined in [`capabilities.md`](capabilities.md), gated by workspace scope. Workspace can come from the resource address (workspace- and flow-level resources) or from a parameter (system-level resources whose parameters reference a workspace, e.g. `create-user with workspace association W`). | +| `authorise_many` | Loop over `authorise` | +| Identity / credential / workspace management | `create-user`, `create-api-key`, etc. as listed below. These are operations on system-level resources (the user / workspace / credential registries); workspace, where it appears in the body, is a parameter. | + +Architectural context — roles, capabilities, workspace as resource +scope, enforcement boundary — lives in [`iam.md`](iam.md) and +[`capabilities.md`](capabilities.md). The contract abstraction +lives in [`iam-contract.md`](iam-contract.md). 
+ +## Transport + +- **Request topic:** `request:tg/request/iam-request` +- **Response topic:** `response:tg/response/iam-response` +- **Pattern:** request/response, correlated by the `id` message + property, the same pattern used by `config-svc` and `flow-svc`. +- **Caller:** the API gateway only. Under the enforcement-boundary + policy (see capabilities spec), the IAM service trusts the bus + and performs no per-request authentication or capability check + against the caller. The gateway has already evaluated capability + membership and workspace scoping before sending the request. + +## Dataclasses + +### `IamRequest` + +```python +@dataclass +class IamRequest: + # One of the operation strings below. + operation: str = "" + + # Scope of this request. Required on every workspace-scoped + # operation. Omitted (or empty) for system-level ops + # (workspace CRUD, signing-key ops, bootstrap, resolve-api-key, + # login). + workspace: str = "" + + # Acting user id. Set by the gateway to the authenticated + # caller's identity handle for every authenticated request + # (overwrites any caller-supplied value — the gateway is the + # only authority for actor identity, so handlers can rely on it + # being authentic). Used for audit logging, self-service ops + # like ``whoami`` that resolve "the caller", and future actor- + # scoped policy checks. Empty for unauthenticated ops + # (``login``, ``bootstrap``, ``bootstrap-status``, + # ``get-signing-key-public``, ``resolve-api-key``). See the + # actor-injection rule in the IAM contract spec. 
+ actor: str = "" + + # --- identity selectors --- + user_id: str = "" + username: str = "" # login; unique within a workspace + key_id: str = "" # revoke-api-key, list-api-keys (own) + api_key: str = "" # resolve-api-key (plaintext) + + # --- credentials --- + password: str = "" # login, change-password (current) + new_password: str = "" # change-password + + # --- user fields --- + user: UserInput | None = None # create-user, update-user + + # --- workspace fields --- + workspace_record: WorkspaceInput | None = None # create-workspace, update-workspace + + # --- api key fields --- + key: ApiKeyInput | None = None # create-api-key +``` + +### `IamResponse` + +```python +@dataclass +class IamResponse: + # Populated on success of operations that return them. + user: UserRecord | None = None # create-user, get-user, update-user + users: list[UserRecord] = field(default_factory=list) # list-users + workspace: WorkspaceRecord | None = None # create-workspace, get-workspace, update-workspace + workspaces: list[WorkspaceRecord] = field(default_factory=list) # list-workspaces + + # create-api-key returns the plaintext once. Never populated + # on any other operation. + api_key_plaintext: str = "" + api_key: ApiKeyRecord | None = None # create-api-key + api_keys: list[ApiKeyRecord] = field(default_factory=list) # list-api-keys + + # login, rotate-signing-key + jwt: str = "" + jwt_expires: str = "" # ISO-8601 UTC + + # get-signing-key-public + signing_key_public: str = "" # PEM + + # resolve-api-key returns who this key authenticates as. + resolved_user_id: str = "" + resolved_workspace: str = "" + resolved_roles: list[str] = field(default_factory=list) + + # reset-password + temporary_password: str = "" # returned once to the operator + + # bootstrap: on first run, the initial admin's one-time API key + # is returned for the operator to capture. 
+ bootstrap_admin_user_id: str = "" + bootstrap_admin_api_key: str = "" + + # bootstrap-status: true iff an unconsumed ``bootstrap`` call + # would currently succeed. Always emitted by the response + # translator (the false case is meaningful for first-run UIs). + bootstrap_available: bool = False + + # Present on any failed operation. + error: Error | None = None +``` + +### Value types + +```python +@dataclass +class UserInput: + username: str = "" + name: str = "" + email: str = "" + password: str = "" # only on create-user; never on update-user + roles: list[str] = field(default_factory=list) + enabled: bool = True + must_change_password: bool = False + +@dataclass +class UserRecord: + id: str = "" + workspace: str = "" + username: str = "" + name: str = "" + email: str = "" + roles: list[str] = field(default_factory=list) + enabled: bool = True + must_change_password: bool = False + created: str = "" # ISO-8601 UTC + # Password hash is never included in any response. + +@dataclass +class WorkspaceInput: + id: str = "" + name: str = "" + enabled: bool = True + +@dataclass +class WorkspaceRecord: + id: str = "" + name: str = "" + enabled: bool = True + created: str = "" # ISO-8601 UTC + +@dataclass +class ApiKeyInput: + user_id: str = "" + name: str = "" # operator-facing label, e.g. "laptop" + expires: str = "" # optional ISO-8601 UTC; empty = no expiry + +@dataclass +class ApiKeyRecord: + id: str = "" + user_id: str = "" + name: str = "" + prefix: str = "" # first 4 chars of plaintext, for identification in lists + expires: str = "" # empty = no expiry + created: str = "" + last_used: str = "" # empty if never used + # key_hash is never included in any response. +``` + +## Operations + +| Operation | Request fields | Response fields | Notes | +|---|---|---|---| +| `login` | `username`, `password`, `workspace` (optional) | `jwt`, `jwt_expires` | If `workspace` omitted, IAM resolves to the user's assigned workspace. 
| +| `whoami` | `actor` (gateway-injected) | `user` | Returns the calling user's own record. AUTHENTICATED-only; no `users:read` capability required. | +| `resolve-api-key` | `api_key` (plaintext) | `resolved_user_id`, `resolved_workspace`, `resolved_roles` | Gateway-internal. Service returns `auth-failed` for unknown / expired / revoked keys. | +| `change-password` | `user_id`, `password` (current), `new_password` | — | Self-service. IAM validates `password` against stored hash. | +| `reset-password` | `user_id`, `workspace` (optional integrity check) | `temporary_password` | Admin-initiated. IAM generates a random password, sets `must_change_password=true` on the user, returns the plaintext once. | +| `create-user` | `workspace`, `user` | `user` | `user.password` is hashed and stored; `user.roles` must be subset of known roles. `workspace` is the new user's home-workspace binding (a required *parameter*, not an address). | +| `list-users` | `workspace` (optional filter) | `users` | If `workspace` omitted, returns the deployment-wide list. | +| `get-user` | `user_id`, `workspace` (optional integrity check) | `user` | | +| `update-user` | `user_id`, `user`, `workspace` (optional integrity check) | `user` | `password` field on `user` is rejected; use `change-password` / `reset-password`. Username is immutable. | +| `disable-user` | `user_id`, `workspace` (optional integrity check) | — | Soft-delete; sets `enabled=false`. Revokes all the user's API keys. | +| `enable-user` | `user_id`, `workspace` (optional integrity check) | — | Re-enables a previously disabled user; does not restore API keys. | +| `delete-user` | `user_id`, `workspace` (optional integrity check) | — | Hard-delete; removes user record, username lookup, and all the user's API keys. | +| `create-workspace` | `workspace_record` | `workspace` | System-level. | +| `list-workspaces` | — | `workspaces` | System-level. | +| `get-workspace` | `workspace_record` (id only) | `workspace` | System-level. 
| +| `update-workspace` | `workspace_record` | `workspace` | System-level. | +| `disable-workspace` | `workspace_record` (id only) | — | System-level. Sets `enabled=false`; revokes all workspace API keys; disables all users in the workspace. | +| `create-api-key` | `key`, `workspace` (optional integrity check) | `api_key_plaintext`, `api_key` | Plaintext returned **once**; only hash stored. `key.name` required. | +| `list-api-keys` | `user_id`, `workspace` (optional integrity check) | `api_keys` | | +| `revoke-api-key` | `key_id`, `workspace` (optional integrity check) | — | Deletes the key record. | +| `get-signing-key-public` | — | `signing_key_public` | Gateway fetches this at startup. | +| `rotate-signing-key` | — | — | System-level. Introduces a new signing key; old key continues to validate JWTs for a grace period (implementation-defined, minimum 1h). | +| `bootstrap` | — | `bootstrap_admin_user_id`, `bootstrap_admin_api_key` | If IAM tables are empty and the service is in `bootstrap` mode, creates the initial `default` workspace, an `admin` user, an initial API key, and an initial signing key; returns them once. Otherwise returns a masked auth failure. | +| `bootstrap-status` | — | `bootstrap_available` | Side-effect-free probe; `true` iff iam-svc is in `bootstrap` mode and tables are empty. Intended for first-run UX. | + +## Error taxonomy + +All errors are carried in the `IamResponse.error` field. `error.type` +is one of the values below; `error.message` is a human-readable +string that is **not** surfaced verbatim to external callers (the +gateway maps to `auth failure` / `access denied` per the IAM error +policy). + +| `type` | When | +|---|---| +| `invalid-argument` | Malformed request (missing required field, unknown operation, invalid format). | +| `not-found` | Named resource does not exist (`user_id`, `key_id`, workspace). | +| `duplicate` | Create operation collides with an existing resource (username, workspace id, key name). 
| +| `auth-failed` | `login` with wrong credentials; `resolve-api-key` with unknown / expired / revoked key; `change-password` with wrong current password. Single bucket to deny oracle attacks. | +| `weak-password` | Password does not meet policy (length, complexity — policy defined at service level). | +| `disabled` | Target user or workspace has `enabled=false`. | +| `operation-not-permitted` | Non-admin attempting system-level operation, or workspace-scoped operation attempting to affect another workspace. | +| `internal-error` | Unexpected IAM-side failure. Log and surface as 500 at the gateway. | + +The gateway is responsible for translating `auth-failed` and +`operation-not-permitted` into the obfuscated external error +response (`"auth failure"` / `"access denied"`); `invalid-argument` +becomes a descriptive 400; `not-found` / `duplicate` / +`weak-password` / `disabled` become descriptive 4xx but never leak +IAM-internal detail. + +## Credential storage + +- **Passwords** are stored using a slow KDF (bcrypt / argon2id — the + service picks; documented as an implementation detail). The + `password_hash` column stores the full KDF-encoded string + (algorithm, cost, salt, hash). Not a plain SHA-256. +- **API keys** are stored as SHA-256 of the plaintext. API keys + are 128-bit random values (`tg_` + base64url); the entropy + makes a slow hash unnecessary. The hash serves as the primary + key on the `iam_api_keys` table, enabling O(1) lookup on + `resolve-api-key`. +- **JWT signing key** is stored as an RSA or Ed25519 private key + (implementation choice) in a dedicated `iam_signing_keys` table + with a `kid`, `created`, and optional `retired` timestamp. At + most one active key; up to N retired keys are kept for a grace + period to validate previously-issued JWTs. 
+ +Passwords, API-key plaintext, and signing-key private material are +never returned in any response other than the explicit one-time +responses above (`reset-password`, `create-api-key`, `bootstrap`). + +## Bootstrap modes + +`iam-svc` requires a bootstrap mode to be chosen at startup. There is +no default — an unset or invalid mode causes the service to refuse +to start. The purpose is to force the operator to make an explicit +security decision rather than rely on an implicit "safe" fallback. + +| Mode | Startup behaviour | `bootstrap` operation | Suitability | +|---|---|---|---| +| `token` | On first start with empty tables, auto-seeds the `default` workspace, admin user, admin API key (using the operator-provided `--bootstrap-token`), and an initial signing key. No-op on subsequent starts. | Refused — returns `auth-failed` / `"auth failure"` regardless of caller. | Production, any public-exposure deployment. | +| `bootstrap` | No startup seeding. Tables remain empty until the `bootstrap` operation is invoked over the pub/sub bus (typically via `tg-bootstrap-iam`). | Live while tables are empty. Generates and returns the admin API key once. Refused (`auth-failed`) once tables are populated. | Dev / compose up / CI. **Not safe under public exposure** — any caller reaching the gateway's `/api/v1/iam` forwarder before the operator can cause a token to be issued to them. Operators choosing this mode accept that risk. | + +### Error masking + +In both modes, any refused invocation of the `bootstrap` operation +returns the same error (`auth-failed` / `"auth failure"`). A caller +cannot distinguish: + +- "service is in token mode" +- "service is in bootstrap mode but already bootstrapped" +- "operation forbidden" + +This matches the general IAM error-policy stance (see `iam.md`) and +prevents externally enumerating IAM's state. + +### Configuration sources + +The mode and token can be supplied two ways. Resolution order is +fixed; there is no permissive fallback. 
+ +| Source | Field | +|---|---| +| Processor-group YAML / CLI argument | `bootstrap_mode`, `bootstrap_token` | +| Environment variable | `IAM_BOOTSTRAP_MODE`, `IAM_BOOTSTRAP_TOKEN` | + +For each setting the service uses the explicit param value if +present; otherwise the environment variable; otherwise the service +refuses to start. The env-var path is intended for the K8s +deployment pattern where the token is injected from a `Secret` via +`secretKeyRef`, so the plaintext never has to live in YAML or git. +A typical production manifest holds `bootstrap_mode: "token"` in +the YAML and pulls `IAM_BOOTSTRAP_TOKEN` from the Secret; the YAML +is then safe to version-control. + +### Bootstrap-token lifecycle + +The bootstrap token — whether operator-supplied (`token` mode) or +service-generated (`bootstrap` mode) — is a one-time credential. It +is stored as admin's single API key, tagged `name="bootstrap"`. The +operator's first admin action after bootstrap should be: + +1. Create a durable admin user and API key (or issue a durable API + key to the bootstrap admin). +2. Revoke the bootstrap key via `revoke-api-key`. +3. Remove the bootstrap token from any deployment configuration + (Secret, env var, or YAML field — wherever it was sourced). + +The `name="bootstrap"` marker makes bootstrap keys easy to detect in +tooling (e.g. a `tg-list-api-keys` filter). + +## HTTP forwarding (initial integration) + +For the initial gateway integration — before the IAM service is +wired into the authentication middleware — the gateway exposes a +single forwarding endpoint: + +``` +POST /api/v1/iam +``` + +- Request body is a JSON encoding of `IamRequest`. +- Response body is a JSON encoding of `IamResponse`. +- The gateway's existing authentication (`GATEWAY_SECRET` bearer) + gates access to this endpoint so the IAM protocol can be + exercised end-to-end in tests without touching the live auth + path. +- This endpoint is **not** the final shape. 
Once the middleware is + in place, per-operation REST endpoints replace it (for example + `POST /api/v1/auth/login`, `POST /api/v1/users`, `DELETE + /api/v1/api-keys/{id}`), and this generic forwarder is removed. + +The endpoint performs only message marshalling: it does not read +or rewrite fields in the request, and it applies no capability +check. All authorisation for user / workspace / key management +lands in the subsequent middleware work. + +## Non-goals for this spec + +- REST endpoint shape for the final gateway surface — covered in + Phase 2 of the IAM implementation plan, not here. +- OIDC / SAML external IdP protocol — out of scope for open source. +- Key-signing algorithm choice, password KDF choice, JWT claim + layout — implementation details captured in code + ADRs, not + locked in the protocol spec. + +## References + +- [IAM Contract Specification](iam-contract.md) — the abstract + gateway↔IAM regime contract this protocol implements. +- [Identity and Access Management Specification](iam.md) +- [Capability Vocabulary Specification](capabilities.md) diff --git a/docs/tech-specs/iam.md b/docs/tech-specs/iam.md index 5de50749..dd0e12f5 100644 --- a/docs/tech-specs/iam.md +++ b/docs/tech-specs/iam.md @@ -199,9 +199,9 @@ The server rejects all non-auth messages until authentication succeeds. The socket remains open on auth failure, allowing the client to retry with a different token without reconnecting. The client can also send a new auth message at any time to re-authenticate — for example, to -refresh an expiring JWT or to switch workspace. The -resolved identity (user, workspace, roles) is updated on each -successful auth. +refresh an expiring JWT or to switch workspace. The resolved +identity (handle, workspace, principal_id, source) is updated on +each successful auth. #### API keys @@ -219,7 +219,7 @@ For programmatic access: CLI tools, scripts, and integrations. On each request, the gateway resolves an API key by: 1. Hashing the token. -2. 
Checking a local cache (hash → user/workspace/roles). +2. Checking a local cache (hash → identity). 3. On cache miss, calling the IAM service to resolve. 4. Caching the result with a short TTL (e.g. 60 seconds). @@ -233,9 +233,15 @@ For interactive access via the UI or WebSocket connections. - A user logs in with username and password. The gateway forwards the request to the IAM service, which validates the credentials and returns a signed JWT. -- The JWT carries the user ID, workspace, and roles as claims. +- The JWT carries identity-binding claims only — user id (`sub`) + and the workspace this credential authenticates to. No roles, + no policy state. Per the IAM contract, all policy decisions go + through `authorise`; the gateway never reads roles or other + regime-internal state from the credential. - The gateway validates JWTs locally using the IAM service's public - signing key — no service call needed on subsequent requests. + signing key — no service call needed for the authentication step; + authorisation calls remain per-request (cached per the contract's + caching rules). - Token expiry is enforced by standard JWT validation at the time the request (or WebSocket connection) is made. - For long-lived WebSocket connections, the JWT is validated at connect @@ -262,6 +268,26 @@ The gateway forwards this to the IAM service, which validates credentials and returns a signed JWT. The gateway returns the JWT to the caller. +#### Self-service: `whoami` and `bootstrap-status` + +Two side-effect-free probes that exist to support UI affordances +without giving the caller broad read access: + +- `POST /api/v1/iam` with `{"operation": "whoami"}` — authenticated + only. Returns the caller's own user record (id, username, name, + email, workspace, roles, enabled, must_change_password, + created). No `users:read` capability is required, because every + authenticated caller can read themselves. 
The gateway populates + `actor` on the request from the authenticated identity, so the + regime resolves "the caller" without taking a target argument. + +- `POST /api/v1/auth/bootstrap-status` — public, side-effect-free. + Returns `{"bootstrap_available": true|false}`. `true` iff + iam-svc is in `bootstrap` mode and its tables are empty (i.e. an + unconsumed `bootstrap` call would currently succeed). Exists so + a first-run UI can decide whether to render the setup flow + without invoking the consuming `bootstrap` op. + #### IAM service delegation The gateway stays thin. Its authentication logic is: @@ -285,35 +311,82 @@ authentication uses API keys or JWTs. On first start, the bootstrap process creates a default workspace and admin user with an initial API key. -### User identity +### Identity, credentials, and workspace binding -A user belongs to exactly one workspace. The design supports extending -this to multi-workspace access in the future (see -[Extension points](#extension-points)). +The gateway never asks "which workspace does *this user* belong to?". +That question forces every IAM regime to expose a user-to-workspace +mapping, which prevents regimes where the relationship is many-to-many +or doesn't exist (e.g. SSO with IdP-driven workspace selection). +Instead, the gateway asks "which workspace does *this credential* +authenticate to?" — a question every regime can answer in its own +terms. -A user record contains: +A credential (API key, JWT, OIDC token, etc.) is **bound to a +workspace at issue time**. The IAM regime decides what binding +means: + +- **OSS regime** — each user has a home workspace; credentials + issued to that user are bound to that workspace. A 1:1 + user-to-workspace constraint is an internal data-model decision, + not a contract assertion. +- **Multi-workspace regime** (future / enterprise) — a user with + access to several workspaces gets a different credential per + workspace. 
Each credential authenticates to exactly one + workspace; the relationship between user and workspace is a + regime-internal detail the gateway does not see. + +When the gateway authenticates a credential, the IAM regime returns +an `Identity` whose `workspace` is the workspace this credential is +for. That value — not "the user's workspace" — is what the gateway +uses for default-fill-in and as input to the IAM `authorise` call. + +#### Identity surface + +What the gateway holds after `authenticate`: + +| Field | Purpose | +|-------|---------| +| `handle` | Opaque token quoted back when calling `authorise`. Regime-defined. | +| `workspace` | The workspace this credential authenticates to. Used as the default if a request omits workspace. | +| `principal_id` | Stable identifier for audit logging (a user id, sub claim, service account id). Never used for authorisation. | +| `source` | How the credential was presented (`api-key`, `jwt`). Logged with audit events; not policy input. | + +Anything else — roles, claims, group memberships, policy attributes +— stays inside the regime and is reachable only via `authorise`. +See [`iam-contract.md`](iam-contract.md) for the full contract. + +#### OSS user record + +The OSS regime stores the following per user. These fields are +**OSS-implementation specifics**, not part of the contract. | Field | Type | Description | |-------|------|-------------| | `id` | string | Unique user identifier (UUID) | | `name` | string | Display name | | `email` | string | Email address (optional) | -| `workspace` | string | Workspace the user belongs to | +| `workspace` | string | Home workspace; default binding for issued credentials | | `roles` | list[string] | Assigned roles (e.g. `["reader"]`) | | `enabled` | bool | Whether the user can authenticate | | `created` | datetime | Account creation timestamp | -The `workspace` field maps to the existing `user` field in `Metadata`. 
-This means the storage-layer isolation (Cassandra, Neo4j, Qdrant -filtering by `user` + `collection`) works without changes — the gateway -sets the `user` metadata field to the authenticated user's workspace. +The `workspace` field on a user record is the **default binding** +used when issuing credentials, not a constraint visible to the +gateway. An enterprise regime may have no user records at all +(authentication delegated to an IdP). ### Workspaces -A workspace is an isolated data boundary. Users belong to a workspace, -and all data operations are scoped to it. Workspaces map to the existing -`user` field in `Metadata` and the corresponding Cassandra keyspace, -Qdrant collection prefix, and Neo4j property filters. +A workspace is an isolated data boundary — a tenancy scope in which +users, flows, configuration, documents, and knowledge graphs live. +Workspaces map to storage-layer isolation: the `user` field in +`Metadata`, the corresponding Cassandra keyspace, the Qdrant +collection prefix, the Neo4j property filter. + +Workspace is the most prominent component of an operation's +**resource scope**: when a request says "do X to Y", workspace is +part of "Y". Listing users, creating flows, querying the graph — +all of these target a specific workspace. | Field | Type | Description | |-------|------|-------------| @@ -322,57 +395,176 @@ Qdrant collection prefix, and Neo4j property filters. | `enabled` | bool | Whether the workspace is active | | `created` | datetime | Creation timestamp | -All data operations are scoped to a workspace. The gateway determines -the effective workspace for each request as follows: +#### Default-fill-in -1. If the request includes a `workspace` parameter, validate it against - the user's assigned workspace. - - If it matches, use it. - - If it does not match, return 403. (This could be extended to - check a workspace access grant list.) -2. If no `workspace` parameter is provided, use the user's assigned - workspace. 
+
+If a request omits workspace, the gateway fills it in from the
+authenticated identity's bound workspace (`identity.workspace`)
+before any IAM check runs. IAM never receives an unresolved
+workspace; every `authorise` call sees a concrete value.
-The gateway sets the `user` field in `Metadata` to the effective
-workspace ID, replacing the caller-supplied `?user=` query parameter.
+#### Authorisation
-This design ensures forward compatibility. Clients that pass a
-workspace parameter will work unchanged if multi-workspace support is
-added later. Requests for an unassigned workspace get a clear 403
-rather than silent misbehaviour.
+Whether the resolved workspace is permitted to be operated on by
+this caller is an **IAM decision**, not a gateway one. The gateway
+calls `authorise(identity, capability, {workspace: ..., ...})` and
+relays the answer. In the OSS regime, the regime checks whether
+the caller's permission grants for `<capability>` include this
+workspace — see [`capabilities.md`](capabilities.md). In other
+regimes the decision could come from group mappings, policies,
+relationship tuples, or anything else the regime models.
+
+### Request anatomy
+
+The shape of a request — where workspace appears, where flow
+appears, where parameters live — follows from **the level of the
+resource being operated on**, not from any single property of the
+request like its URL or its required capability.
+ +Resources live at one of three levels (see also the resource model +in [`iam-contract.md`](iam-contract.md)): + +| Resource level | Examples | Resource address | +|---|---|---| +| **System** | The user registry, the workspace registry, the IAM signing key, the audit log | empty `{}` | +| **Workspace** | A workspace's config, flow definitions, library, knowledge cores, collections | `{workspace: ...}` | +| **Flow** | A flow's knowledge graph, agent state, LLM context, embeddings, MCP context | `{workspace: ..., flow: ...}` | + +For the gateway-to-bus mapping this dictates **where workspace +lives in the message**, but only when workspace is part of the +*resource address*. Workspace can also appear as an *operation +parameter* on system-level resources (see below). + +#### Workspace as address vs. parameter + +Two distinct roles, two distinct locations: + +- **Workspace as address component.** Workspace identifies the + thing being operated on. Used for workspace-level and flow-level + resources. Lives in the addressing layer of the message — the + URL path for HTTP, or the WebSocket envelope alongside `flow` for + flow-scoped operations sent through the Mux. +- **Workspace as operation parameter.** Workspace is data the + operation acts on, while the resource itself is system-level. + Used for operations on the user registry (`create-user with + workspace association W`), the workspace registry (`create- + workspace W`), and other system-level operations that happen to + reference a workspace. Lives in the request body or inner WS + payload alongside the operation's other parameters. + +The two roles never coexist on the same operation. Either the +operation addresses something within a workspace (workspace is in +the address) or it operates on a system-level resource with +workspace as a parameter (workspace is in the body) or workspace +is irrelevant (system-level operations like `bootstrap`, +`rotate-signing-key`, `login` itself). 
+ +#### Where workspace lives, by request type + +| Request type | Resource level | Workspace lives in | +|---|---|---| +| Flow-scoped data plane (`agent`, `graph-rag`, `llm`, `embeddings`, `mcp`, etc.) | Flow | Envelope alongside `flow` (WS) or URL path (HTTP) — part of the address | +| Workspace-scoped control plane (`config`, `library`, `knowledge`, `collection-management`, flow lifecycle) | Workspace | Body / inner request — part of the address | +| User registry ops (`create-user`, `list-users`, `disable-user`, etc.) | System | Body — as a *parameter* (the user's workspace association or a list filter) | +| Workspace registry ops (`create-workspace`, `list-workspaces`, etc.) | System | Body — as a *parameter* (the workspace identifier in `workspace_record`) | +| Credential ops (`create-api-key`, `revoke-api-key`, `change-password`, `reset-password`) | System | Body — as a *parameter* on ops that have one; absent on `change-password` (target is the caller's identity) | +| System ops (`bootstrap`, `login`, `rotate-signing-key`, `get-signing-key-public`) | System | Not present at all | + +The classification is deliberate. Users are a global concept that +*have* a workspace; they don't *live* in one. An OSS regime has +1:1 user-to-workspace; a multi-workspace regime maps a user to many +workspaces; an SSO regime might delegate workspace membership to an +IdP entirely. The gateway treats user-registry operations as +system-level so the contract is the same across regimes — the +workspace association is a parameter the regime interprets in its +own terms. + +#### HTTP + +HTTP routes by URL path, so the address lives in the URL. +Per-operation REST shape: + +- Flow-level: `POST /api/v1/workspaces/{w}/flows/{f}/services/{kind}` + — `workspace` and `flow` are URL components. +- Workspace-level: `POST /api/v1/workspaces/{w}/config`, + `/api/v1/workspaces/{w}/library`, etc. — `workspace` is a URL + component. 
+- System-level: `POST /api/v1/users`, `/api/v1/workspaces`, etc. — + no workspace in URL; if the operation references one, it's a + field in the body. + +`/api/v1/iam` is itself registry-driven: the body's `operation` +field is looked up against the registry to obtain the capability, +resource shape, and parameter shape per operation, rather than +gating the whole endpoint with a single coarse capability. + +#### WebSocket Mux + +The Mux envelope is the addressing layer for flow-scoped +operations. For workspace-level and system-level operations the +envelope routes by `service` only, and the inner request payload +carries the address components or parameters as appropriate. See +[`iam-contract.md`](iam-contract.md) for the operation-registry +mechanism the Mux uses to know which fields to read. ### Roles and access control -Three roles with fixed permissions: +Roles are an OSS-regime concept and live entirely in the IAM +service. The gateway does not enumerate or check them — it asks +`authorise(identity, capability, resource, parameters)` per +request and the regime maps the caller's roles to a decision. -| Role | Data operations | Admin operations | System | -|------|----------------|-----------------|--------| -| `reader` | Query knowledge graph, embeddings, RAG | None | None | -| `writer` | All reader operations + load documents, manage collections | None | None | -| `admin` | All writer operations | Config, flows, collection management, user management | Metrics | +The OSS regime ships three roles: -Role checks happen at the gateway before dispatching to backend -services. Each endpoint declares the minimum role required: +| Role | Capabilities granted | +|------|----------------------| +| `reader` | Read capabilities on data and config (`graph:read`, `documents:read`, `rows:read`, `config:read`, `flows:read`, `knowledge:read`, `collections:read`, `keys:self`, plus the per-service caps `agent`, `llm`, `embeddings`, `mcp`). 
| +| `writer` | All reader capabilities, plus `graph:write`, `documents:write`, `rows:write`, `knowledge:write`, `collections:write`. | +| `admin` | All writer capabilities, plus `config:write`, `flows:write`, `users:read`, `users:write`, `users:admin`, `keys:admin`, `workspaces:admin`, `iam:admin`, `metrics:read`. | -| Endpoint pattern | Minimum role | -|-----------------|--------------| -| `GET /api/v1/socket` (queries) | `reader` | -| `POST /api/v1/librarian` | `writer` | -| `POST /api/v1/flow/*/import/*` | `writer` | -| `POST /api/v1/config` | `admin` | -| `GET /api/v1/flow/*` | `admin` | -| `GET /api/metrics` | `admin` | +Workspace scope is a property of the *grant*, not of the user or +role. In the OSS regime each capability granted by `reader` / +`writer` is scoped to the workspace the user record is associated +with; capabilities granted by `admin` are scoped to `*` (every +workspace). A user is a system-level object — they don't "live +in" a workspace, they hold permissions whose scope happens to +reference one. -Roles are hierarchical: `admin` implies `writer`, which implies -`reader`. +The OSS regime is deliberately limited to one workspace association +per user; future regimes are free to grant the same user different +permissions in different workspaces, or use a non-workspace scope +entirely. This is regime-internal — neither the contract nor the +gateway carries an assumption either way. + +The gateway gates each endpoint by *capability*, not by role. +Capabilities are declared per operation in the gateway's operation +registry; see [`iam-contract.md`](iam-contract.md) for the +registry mechanism and [`capabilities.md`](capabilities.md) for +the capability vocabulary. ### IAM service -The IAM service is a new backend service that manages all identity and -access data. It is the authority for users, workspaces, API keys, and -credentials. The gateway delegates to it. 
+The IAM service is a backend service that implements the +[IAM contract](iam-contract.md) — `authenticate`, `authorise`, and +the management operations the gateway forwards. It is the +authority for identity, credential validation, and access decisions. +The gateway treats it as a black box behind the contract; nothing +in the gateway is regime-specific. -#### Data model +The OSS distribution ships one IAM regime: a role-based service +backed by Cassandra, described in +[`iam-protocol.md`](iam-protocol.md). Enterprise / future regimes +can replace this implementation without changing the gateway, the +wire protocol between gateway and backends, or the capability +vocabulary — see the contract spec for the abstraction the gateway +is wired against and the implementation notes for what other +regimes look like. + +#### OSS data model + +The OSS regime stores users, workspaces, API keys, and signing +keys in Cassandra. This is an **OSS regime implementation +detail**; it is not part of the contract. Other regimes will have +different (or no) data models. ``` iam_workspaces ( @@ -423,44 +615,89 @@ resolve API keys and to handle login requests. User management operations (create user, revoke key, etc.) also go through the IAM service. +### Error policy + +External error responses carry **no diagnostic detail** for +authentication or access-control failures. The goal is to give an +attacker probing the endpoint no signal about which condition they +tripped. 
+ +| Category | HTTP | Body | WebSocket frame | +|----------|------|------|-----------------| +| Authentication failure | `401 Unauthorized` | `{"error": "auth failure"}` | `{"type": "auth-failed", "error": "auth failure"}` | +| Access control failure | `403 Forbidden` | `{"error": "access denied"}` | `{"error": "access denied"}` (endpoint-specific frame type) | + +"Authentication failure" covers missing credential, malformed +credential, invalid signature, expired token, revoked API key, and +unknown API key — all indistinguishable to the caller. + +"Access control failure" covers role insufficient, workspace +mismatch, user disabled, and workspace disabled — all +indistinguishable to the caller. + +**Server-side logging is richer.** The audit log records the specific +reason (`"workspace-mismatch: user alice assigned 'acme', requested +'beta'"`, `"role-insufficient: admin required, user has writer"`, +etc.) for operators and post-incident forensics. These messages never +appear in responses. + +Other error classes (bad request, internal error) remain descriptive +because they do not reveal anything about the auth or access-control +surface — e.g. `"missing required field 'workspace'"` or +`"invalid JSON"` is fine. + ### Gateway changes -The current `Authenticator` class is replaced with a thin authentication -middleware that delegates to the IAM service: +The current `Authenticator` class is replaced with a thin +authentication+authorisation middleware that delegates to the IAM +service per the IAM contract. The gateway performs no role check +itself — authorisation is asked of the regime via `authorise`. For HTTP requests: 1. Extract Bearer token from the `Authorization` header. 2. If the token has JWT format (dotted structure): - Validate signature locally using the cached public key. - - Extract user ID, workspace, and roles from claims. + - Build an `Identity` from `sub` and `workspace` claims (no + other claims are consulted). 3. 
Otherwise, treat as an API key: - Hash the token and check the local cache. - - On cache miss, call the IAM service to resolve. - - Cache the result (user/workspace/roles) with a short TTL. + - On cache miss, call the IAM service to resolve to an + `Identity` (handle, workspace, principal_id, source). + - Cache the result with a short TTL. 4. If neither succeeds, return 401. -5. If the user or workspace is disabled, return 403. -6. Check the user's role against the endpoint's minimum role. If - insufficient, return 403. -7. Resolve the effective workspace: - - If the request includes a `workspace` parameter, validate it - against the user's assigned workspace. Return 403 on mismatch. - - If no `workspace` parameter, use the user's assigned workspace. -8. Set the `user` field in the request context to the effective - workspace ID. This propagates through `Metadata` to all downstream - services. +5. Look up the operation in the gateway's operation registry to get + `(capability, resource_level, extractors)`. Build the resource + address (system / workspace / flow level) and parameters from + the request. +6. Default-fill the workspace into the body when the operation is + workspace- or flow-level (so downstream code sees a single + canonical address); the resource address keeps its supplied + value. +7. Call `authorise(identity, capability, resource, parameters)`. + On allow, forward the request; on deny, return 403. On regime + error, fail closed (401 / 503 per deployment). +8. Cache the decision per the contract's caching rules (clamped + above by a deployment-set ceiling). +9. For requests forwarded to iam-svc, set `actor` on the body + from `identity.handle`, overwriting any caller-supplied value. + See [`iam-contract.md`](iam-contract.md#actor-injection). For WebSocket connections: 1. Accept the connection in an unauthenticated state. 2. Wait for an auth message (`{"type": "auth", "token": "..."}`). -3. Validate the token using the same logic as steps 2-7 above. 
+3. Validate the token using the same logic as steps 1-3 above. 4. On success, attach the resolved identity to the connection and send `{"type": "auth-ok", ...}`. 5. On failure, send `{"type": "auth-failed", ...}` but keep the socket open. 6. Reject all non-auth messages until authentication succeeds. 7. Accept new auth messages at any time to re-authenticate. +8. For each subsequent request frame, look up + `flow-service:<service>` in the registry and call `authorise` + against the `{workspace, flow}` resource — same authority + gateway HTTP callers see, evaluated per-frame. ### CLI changes @@ -713,6 +950,16 @@ These are not implemented but the architecture does not preclude them: - **Multi-workspace access.** Users could be granted access to additional workspaces beyond their primary assignment. The workspace validation step checks a grant list instead of a single assignment. +- **Workspace resolver.** Workspace resolution on each authenticated + request — "given this user and this requested workspace, which + workspace (if any) may the request operate on?" — is encapsulated + in a single pluggable resolver. The open-source edition ships a + resolver that permits only the user's single assigned workspace; + enterprise editions that implement multi-workspace access swap in a + resolver that consults a permitted set. The wire protocol (the + optional `workspace` field on the authenticated request) is + identical in both editions, so clients written against one edition + work unchanged against the other. - **Rules-based access control.** A separate access control service could evaluate fine-grained policies (per-collection permissions, operation-level restrictions, time-based access). The gateway @@ -848,10 +1095,15 @@ service, not in the config service. Reasons: - **API key scoping.** API keys could be scoped to specific collections within a workspace rather than granting workspace-wide access. To be designed when the need arises. 
-- **tg-init-trustgraph** only initialises a single workspace. ## References +- [IAM Contract Specification](iam-contract.md) — the gateway↔IAM + regime abstraction this design is wired against. +- [IAM Service Protocol Specification](iam-protocol.md) — the OSS + regime's wire-level protocol. +- [Capability Vocabulary Specification](capabilities.md) — the + capability strings the gateway uses as `authorise` input. - [Data Ownership and Information Separation](data-ownership-model.md) - [MCP Tool Bearer Token Specification](mcp-tool-bearer-token.md) - [Multi-Tenant Support Specification](multi-tenant-support.md) diff --git a/iam-testing.txt b/iam-testing.txt new file mode 100644 index 00000000..0d03ffc3 --- /dev/null +++ b/iam-testing.txt @@ -0,0 +1,252 @@ + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation": "bootstrap"}' + + + + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation": "resolve-api-key", "api_key": "tg_r-n43hDWV9WOY06w6o5YpevAxirlS33D"}' + + + + + + + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation": "resolve-api-key", "api_key": "asdalsdjasdkasdasda"}' + + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"list-users","workspace":"default"}' + + + + # 1. Admin creates a writer user "alice" + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{ + "operation": "create-user", + "workspace": "default", + "user": { + "username": "alice", + "name": "Alice", + "email": "alice@example.com", + "password": "changeme", + "roles": ["writer"] + } + }' + # expect: {"user": {"id": "", ...}} — grab alice's uuid + + # 2. 
Issue alice an API key + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{ + "operation": "create-api-key", + "workspace": "default", + "key": { + "user_id": "f2363a10-3b83-44ea-a008-43caae8ba607", + "name": "alice-laptop" + } + }' + # expect: {"api_key_plaintext": "tg_...", "api_key": {"id": "", "prefix": "tg_xxxx", ...}} + + # 3. Resolve alice's key — should return alice's id + workspace + writer role + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"resolve-api-key","api_key":"tg_gt4buvk5NG-QS7oP_0Gk5yTWyj1qensf"}' + + # expect: {"resolved_user_id":"","resolved_workspace":"default","resolved_roles":["writer"]} + + # 4. List alice's keys (admin view of alice's keys) + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"list-api-keys","workspace":"default","user_id":"f2363a10-3b83-44ea-a008-43caae8ba607"}' + # expect: {"api_keys": [{"id":"","user_id":"","name":"alice-laptop","prefix":"tg_xxxx",...}]} + + # 5. Revoke alice's key + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"revoke-api-key","workspace":"default","key_id":"55f1c1f7-5448-49fd-9eda-56c192b61177"}' + + + # expect: {} (empty, no error) + + # 6. Confirm the revoked key no longer resolves + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"resolve-api-key","api_key":"tg_gt4buvk5NG-QS7oP_0Gk5yTWyj1qensf"}' + # expect: {"error":{"type":"auth-failed","message":"unknown api key"}} + + + +---------------------------------------------------------------------------- + + You'll want to re-bootstrap a fresh deployment to pick up the new signing-key row (or accept that login will lazily generate one on first + call). Then: + + # 1. 
Create a user with a known password (admin's password is random) + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"create-user","workspace":"default","user":{"username":"alice","password":"s3cret","roles":["writer"]}}' + + + + # 2. Log alice in + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"login","username":"alice","password":"s3cret"}' + # expect: {"jwt":"eyJ...","jwt_expires":"2026-..."} + + # 3. Fetch the public key (what the gateway will use later to verify) + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"get-signing-key-public"}' + + # expect: {"signing_key_public":"-----BEGIN PUBLIC KEY-----\n..."} + + # 4. Wrong password + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Authorization: Bearer $GATEWAY_SECRET" \ + -H "Content-Type: application/json" \ + -d '{"operation":"login","username":"alice","password":"nope"}' + + + + # expect: {"error":{"type":"auth-failed","message":"bad credentials"}} + + + + + +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAseLB/a9Bo/RN/Rb/x763 ++vdxmUKG75oWsXBmbwZGDXyN6fwqZ3L7cEje93qK0PYFuCHxhY1Hn0gW7FZ8ovH+ +qEksekUlpfPYqKGiT5Mb0DKk49D4yKkIbJFugWalpwIilvRbQO0jy3V8knqGQ1xL +NfNYFrI2Rxe0Tq2OHVYc5YwYbyj1nz2TY5fd9qrzXtGRv5HZztkl25lWhRvG9G0K +urKDdBDbi894gIYorXvcwZw/b1GDXG/aUy/By1Oy3hXnCLsN8pA3nA437TTTWxHx +QgPH15jIF9hezO+3/ESZ7EhVEtgmwTxPddfXRa0ZoT6JyWOgcloKtnP4Lp9eQ4va +yQIDAQAB +-----END PUBLIC KEY----- + + + + + + New operations: + - change-password — self-service. Requires current + new password. + - reset-password — admin-driven. Generates a random temporary, sets must_change_password=true, returns plaintext once. + - get-user, update-user, disable-user — workspace-scoped. update-user refuses to change username (immutable — error if different) and refuses + password-via-update. 
disable-user also revokes all the user's API keys, per spec. + - create-workspace, list-workspaces, get-workspace, update-workspace, disable-workspace — system-level. disable-workspace cascades: disables + all users + revokes all their keys. Rejects ids starting with _ (reserved, per the bootstrap framework convention). + - rotate-signing-key — generates a new Ed25519 key, retires the current one (sets retired timestamp; row stays for future grace-period + validation), switches the in-memory cache. + + Touched files: + - trustgraph-flow/trustgraph/tables/iam.py — added retire_signing_key, update_user_profile, update_user_password, update_user_enabled, + update_workspace. + - trustgraph-flow/trustgraph/iam/service/iam.py — 12 new handlers + dispatch entries. + - trustgraph-base/trustgraph/base/iam_client.py — matching client helpers for all of them. + + Smoke-test suggestions: + + # change password for alice (from "s3cret" → "n3wer") + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"change-password","user_id":"b2960feb-caef-401d-af65-01bdb6960cad","password":"s3cret","new_password":"n3wer"}' + + # login with new password + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"login","username":"alice","password":"n3wer"}' + + # admin resets alice's password + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"reset-password","workspace":"default","user_id":"b2960feb-caef-401d-af65-01bdb6960cad"}' + + + # → {"temporary_password":"..."} + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"login","username":"alice","password":"fH2ttyrIcVXCIkH_"}' + + + # create a second workspace + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d 
'{"operation":"create-workspace","workspace_record":{"id":"acme","name":"Acme Corp","enabled":true}}' + + + # rotate signing key (next login produces a JWT signed by a new kid) + + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -d '{"operation":"rotate-signing-key"}' + + + + + + + curl -s -X POST "http://localhost:8088/api/v1/flow" \ + -H "Authorization: Bearer tg_bs_kBAhfejiEJmbcO1gElbxk3MpV7wQFygP" \ + -H "Content-Type: application/json" \ + -d '{"operation":"list-flows"}' + + curl -s -X POST "http://localhost:8088/api/v1/iam" \ + -H "Authorization: Bearer tg_bs_kBAhfejiEJmbcO1gElbxk3MpV7wQFygP" \ + -H "Content-Type: application/json" \ + -d '{"operation":"list-users"}' + + + + curl -s -X POST http://localhost:8088/api/v1/iam \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer tg_bs_kBAhfejiEJmbcO1gElbxk3MpV7wQFygP" \ + -d '{ + "operation": "create-user", + "workspace": "default", + "user": { + "username": "alice", + "name": "Alice", + "email": "alice@example.com", + "password": "s3cret", + "roles": ["writer"] + } + }' + + + + + # Login (public, no token needed) → returns a JWT + curl -s -X POST "http://localhost:8088/api/v1/auth/login" \ + -H "Content-Type: application/json" \ + -d '{"username":"alice","password":"s3cret"}' + + + + export TRUSTGRAPH_TOKEN=$(tg-bootstrap-iam) # on fresh bootstrap-mode deployment + # or set to your existing admin API key + + tg-create-user --username alice --roles writer + # → prints alice's user id + + ALICE_ID= + + ALICE_KEY=$(tg-create-api-key --user-id $ALICE_ID --name alice-laptop) + # → alice's plaintext API key + + tg-list-users + tg-list-api-keys --user-id $ALICE_ID + + tg-revoke-api-key --key-id <...> + tg-disable-user --user-id $ALICE_ID + + # User self-service: + tg-login --username alice # prompts for password, prints JWT + tg-change-password # prompts for current + new + + diff --git a/tests/unit/test_embeddings/test_ollama_dynamic_model.py 
b/tests/unit/test_embeddings/test_ollama_dynamic_model.py index d52a58c6..cfbc4d6e 100644 --- a/tests/unit/test_embeddings/test_ollama_dynamic_model.py +++ b/tests/unit/test_embeddings/test_ollama_dynamic_model.py @@ -14,13 +14,13 @@ from trustgraph.embeddings.ollama.processor import Processor class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase): """Test Ollama dynamic model selection""" - @patch('trustgraph.embeddings.ollama.processor.Client') + @patch('trustgraph.embeddings.ollama.processor.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__') async def test_client_initialized_with_host(self, mock_embeddings_init, mock_async_init, mock_client_class): """Test that Ollama client is initialized with correct host""" # Arrange - mock_ollama_client = Mock() + mock_ollama_client = AsyncMock() mock_response = Mock() mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]] mock_ollama_client.embed.return_value = mock_response @@ -36,13 +36,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase): mock_client_class.assert_called_once_with(host="http://localhost:11434") assert processor.default_model == "test-model" - @patch('trustgraph.embeddings.ollama.processor.Client') + @patch('trustgraph.embeddings.ollama.processor.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__') async def test_on_embeddings_uses_default_model(self, mock_embeddings_init, mock_async_init, mock_client_class): """Test that on_embeddings uses default model when no model specified""" # Arrange - mock_ollama_client = Mock() + mock_ollama_client = AsyncMock() mock_response = Mock() mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]] mock_ollama_client.embed.return_value = mock_response @@ -62,13 +62,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase): ) assert result == [[0.1, 
0.2, 0.3, 0.4, 0.5]] - @patch('trustgraph.embeddings.ollama.processor.Client') + @patch('trustgraph.embeddings.ollama.processor.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__') async def test_on_embeddings_uses_specified_model(self, mock_embeddings_init, mock_async_init, mock_client_class): """Test that on_embeddings uses specified model when provided""" # Arrange - mock_ollama_client = Mock() + mock_ollama_client = AsyncMock() mock_response = Mock() mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]] mock_ollama_client.embed.return_value = mock_response @@ -88,13 +88,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase): ) assert result == [[0.1, 0.2, 0.3, 0.4, 0.5]] - @patch('trustgraph.embeddings.ollama.processor.Client') + @patch('trustgraph.embeddings.ollama.processor.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__') async def test_multiple_model_switches(self, mock_embeddings_init, mock_async_init, mock_client_class): """Test switching between multiple models""" # Arrange - mock_ollama_client = Mock() + mock_ollama_client = AsyncMock() mock_response = Mock() mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]] mock_ollama_client.embed.return_value = mock_response @@ -118,13 +118,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase): assert calls[2][1]['model'] == "model-a" assert calls[3][1]['model'] == "test-model" # Default - @patch('trustgraph.embeddings.ollama.processor.Client') + @patch('trustgraph.embeddings.ollama.processor.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__') async def test_none_model_uses_default(self, mock_embeddings_init, mock_async_init, mock_client_class): """Test that None model parameter falls back 
to default""" # Arrange - mock_ollama_client = Mock() + mock_ollama_client = AsyncMock() mock_response = Mock() mock_response.embeddings = [[0.1, 0.2, 0.3, 0.4, 0.5]] mock_ollama_client.embed.return_value = mock_response @@ -143,13 +143,13 @@ class TestOllamaDynamicModelLoading(IsolatedAsyncioTestCase): input=["test text"] ) - @patch('trustgraph.embeddings.ollama.processor.Client') + @patch('trustgraph.embeddings.ollama.processor.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.embeddings_service.EmbeddingsService.__init__') async def test_initialization_without_model_uses_default(self, mock_embeddings_init, mock_async_init, mock_client_class): """Test initialization without model parameter uses module default""" # Arrange - mock_ollama_client = Mock() + mock_ollama_client = AsyncMock() mock_client_class.return_value = mock_ollama_client mock_async_init.return_value = None mock_embeddings_init.return_value = None diff --git a/tests/unit/test_gateway/test_auth.py b/tests/unit/test_gateway/test_auth.py index d4d4fc2b..26e93fd9 100644 --- a/tests/unit/test_gateway/test_auth.py +++ b/tests/unit/test_gateway/test_auth.py @@ -1,69 +1,447 @@ """ -Tests for Gateway Authentication +Tests for gateway/auth.py — IamAuth, JWT verification, API key +resolution cache. + +JWTs are signed with real Ed25519 keypairs generated per-test, so +the crypto path is exercised end-to-end without mocks. API-key +resolution is tested against a stubbed IamClient since the real +one requires pub/sub. 
""" +import base64 +import json +import time +from unittest.mock import AsyncMock, Mock, patch + import pytest +from aiohttp import web +from cryptography.hazmat.primitives import serialization +from cryptography.hazmat.primitives.asymmetric import ed25519 -from trustgraph.gateway.auth import Authenticator +from trustgraph.gateway.auth import ( + IamAuth, Identity, + _b64url_decode, _verify_jwt_eddsa, + API_KEY_CACHE_TTL, +) -class TestAuthenticator: - """Test cases for Authenticator class""" +# -- helpers --------------------------------------------------------------- - def test_authenticator_initialization_with_token(self): - """Test Authenticator initialization with valid token""" - auth = Authenticator(token="test-token-123") - - assert auth.token == "test-token-123" - assert auth.allow_all is False - def test_authenticator_initialization_with_allow_all(self): - """Test Authenticator initialization with allow_all=True""" - auth = Authenticator(allow_all=True) - - assert auth.token is None - assert auth.allow_all is True +def _b64url(data: bytes) -> str: + return base64.urlsafe_b64encode(data).rstrip(b"=").decode("ascii") - def test_authenticator_initialization_without_token_raises_error(self): - """Test Authenticator initialization without token raises RuntimeError""" - with pytest.raises(RuntimeError, match="Need a token"): - Authenticator() - def test_authenticator_initialization_with_empty_token_raises_error(self): - """Test Authenticator initialization with empty token raises RuntimeError""" - with pytest.raises(RuntimeError, match="Need a token"): - Authenticator(token="") +def make_keypair(): + priv = ed25519.Ed25519PrivateKey.generate() + public_pem = priv.public_key().public_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PublicFormat.SubjectPublicKeyInfo, + ).decode("ascii") + return priv, public_pem - def test_permitted_with_allow_all_returns_true(self): - """Test permitted method returns True when allow_all is enabled""" - auth = 
Authenticator(allow_all=True) - - # Should return True regardless of token or roles - assert auth.permitted("any-token", []) is True - assert auth.permitted("different-token", ["admin"]) is True - assert auth.permitted(None, ["user"]) is True - def test_permitted_with_matching_token_returns_true(self): - """Test permitted method returns True with matching token""" - auth = Authenticator(token="secret-token") - - # Should return True when tokens match - assert auth.permitted("secret-token", []) is True - assert auth.permitted("secret-token", ["admin", "user"]) is True +def sign_jwt(priv, claims, alg="EdDSA"): + header = {"alg": alg, "typ": "JWT", "kid": "kid-test"} + h = _b64url(json.dumps(header, separators=(",", ":"), sort_keys=True).encode()) + p = _b64url(json.dumps(claims, separators=(",", ":"), sort_keys=True).encode()) + signing_input = f"{h}.{p}".encode("ascii") + if alg == "EdDSA": + sig = priv.sign(signing_input) + else: + raise ValueError(f"test helper doesn't sign {alg}") + return f"{h}.{p}.{_b64url(sig)}" - def test_permitted_with_non_matching_token_returns_false(self): - """Test permitted method returns False with non-matching token""" - auth = Authenticator(token="secret-token") - - # Should return False when tokens don't match - assert auth.permitted("wrong-token", []) is False - assert auth.permitted("different-token", ["admin"]) is False - assert auth.permitted(None, ["user"]) is False - def test_permitted_with_token_and_allow_all_returns_true(self): - """Test permitted method with both token and allow_all set""" - auth = Authenticator(token="test-token", allow_all=True) - - # allow_all should take precedence - assert auth.permitted("any-token", []) is True - assert auth.permitted("wrong-token", ["admin"]) is True \ No newline at end of file +def make_request(auth_header): + """Minimal stand-in for an aiohttp request — IamAuth only reads + ``request.headers["Authorization"]``.""" + req = Mock() + req.headers = {} + if auth_header is not None: + 
req.headers["Authorization"] = auth_header + return req + + +# -- pure helpers ---------------------------------------------------------- + + +class TestB64UrlDecode: + + def test_round_trip_without_padding(self): + data = b"hello" + encoded = _b64url(data) + assert _b64url_decode(encoded) == data + + def test_handles_various_lengths(self): + for s in (b"a", b"ab", b"abc", b"abcd", b"abcde"): + assert _b64url_decode(_b64url(s)) == s + + +# -- JWT verification ----------------------------------------------------- + + +class TestVerifyJwtEddsa: + + def test_valid_jwt_passes(self): + priv, pub = make_keypair() + claims = { + "sub": "user-1", "workspace": "default", + "iat": int(time.time()), + "exp": int(time.time()) + 60, + } + token = sign_jwt(priv, claims) + got = _verify_jwt_eddsa(token, pub) + assert got["sub"] == "user-1" + assert got["workspace"] == "default" + + def test_expired_jwt_rejected(self): + priv, pub = make_keypair() + claims = { + "sub": "user-1", "workspace": "default", + "iat": int(time.time()) - 3600, + "exp": int(time.time()) - 1, + } + token = sign_jwt(priv, claims) + with pytest.raises(ValueError, match="expired"): + _verify_jwt_eddsa(token, pub) + + def test_bad_signature_rejected(self): + priv_a, _ = make_keypair() + _, pub_b = make_keypair() + claims = { + "sub": "user-1", "workspace": "default", + "iat": int(time.time()), + "exp": int(time.time()) + 60, + } + token = sign_jwt(priv_a, claims) + # pub_b never signed this token. + with pytest.raises(Exception): + _verify_jwt_eddsa(token, pub_b) + + def test_malformed_jwt_rejected(self): + _, pub = make_keypair() + with pytest.raises(ValueError, match="malformed"): + _verify_jwt_eddsa("not-a-jwt", pub) + + def test_unsupported_algorithm_rejected(self): + priv, pub = make_keypair() + # Manually build an "alg":"HS256" header — no signer needed + # since we expect it to bail before verifying. 
+ header = {"alg": "HS256", "typ": "JWT", "kid": "x"} + payload = { + "sub": "user-1", "workspace": "default", + "iat": int(time.time()), "exp": int(time.time()) + 60, + } + h = _b64url(json.dumps(header, separators=(",", ":")).encode()) + p = _b64url(json.dumps(payload, separators=(",", ":")).encode()) + sig = _b64url(b"not-a-real-sig") + token = f"{h}.{p}.{sig}" + with pytest.raises(ValueError, match="unsupported alg"): + _verify_jwt_eddsa(token, pub) + + +# -- Identity -------------------------------------------------------------- + + +class TestIdentity: + + def test_fields(self): + i = Identity( + handle="u", workspace="w", + principal_id="u", source="api-key", + ) + assert i.handle == "u" + assert i.workspace == "w" + assert i.principal_id == "u" + assert i.source == "api-key" + + +# -- IamAuth.authenticate -------------------------------------------------- + + +class TestIamAuthDispatch: + """``authenticate()`` chooses between the JWT and API-key paths + by shape of the bearer.""" + + @pytest.mark.asyncio + async def test_no_authorization_header_raises_401(self): + auth = IamAuth(backend=Mock()) + with pytest.raises(web.HTTPUnauthorized): + await auth.authenticate(make_request(None)) + + @pytest.mark.asyncio + async def test_non_bearer_header_raises_401(self): + auth = IamAuth(backend=Mock()) + with pytest.raises(web.HTTPUnauthorized): + await auth.authenticate(make_request("Basic whatever")) + + @pytest.mark.asyncio + async def test_empty_bearer_raises_401(self): + auth = IamAuth(backend=Mock()) + with pytest.raises(web.HTTPUnauthorized): + await auth.authenticate(make_request("Bearer ")) + + @pytest.mark.asyncio + async def test_unknown_format_raises_401(self): + # Not tg_... and not dotted-JWT shape. 
+ auth = IamAuth(backend=Mock()) + with pytest.raises(web.HTTPUnauthorized): + await auth.authenticate(make_request("Bearer garbage")) + + @pytest.mark.asyncio + async def test_valid_jwt_resolves_to_identity(self): + priv, pub = make_keypair() + claims = { + "sub": "user-1", "workspace": "default", + "iat": int(time.time()), + "exp": int(time.time()) + 60, + } + token = sign_jwt(priv, claims) + + auth = IamAuth(backend=Mock()) + auth._signing_public_pem = pub + + ident = await auth.authenticate( + make_request(f"Bearer {token}") + ) + assert ident.handle == "user-1" + assert ident.workspace == "default" + assert ident.principal_id == "user-1" + assert ident.source == "jwt" + + @pytest.mark.asyncio + async def test_jwt_without_public_key_fails(self): + # If the gateway hasn't fetched IAM's public key yet, JWTs + # must not validate — even ones that would otherwise pass. + priv, _ = make_keypair() + claims = { + "sub": "user-1", "workspace": "default", + "iat": int(time.time()), "exp": int(time.time()) + 60, + } + token = sign_jwt(priv, claims) + auth = IamAuth(backend=Mock()) + # _signing_public_pem defaults to None + with pytest.raises(web.HTTPUnauthorized): + await auth.authenticate(make_request(f"Bearer {token}")) + + @pytest.mark.asyncio + async def test_api_key_path(self): + auth = IamAuth(backend=Mock()) + + async def fake_resolve(api_key): + assert api_key == "tg_testkey" + # Roles are returned by the regime as a hint but the + # gateway ignores them — kept here so the resolve + # protocol shape is exercised. 
+ return ("user-xyz", "default", ["admin"]) + + async def fake_with_client(op): + return await op(Mock(resolve_api_key=fake_resolve)) + + with patch.object(auth, "_with_client", side_effect=fake_with_client): + ident = await auth.authenticate( + make_request("Bearer tg_testkey") + ) + assert ident.handle == "user-xyz" + assert ident.workspace == "default" + assert ident.principal_id == "user-xyz" + assert ident.source == "api-key" + + @pytest.mark.asyncio + async def test_api_key_rejection_masked_as_401(self): + auth = IamAuth(backend=Mock()) + + async def fake_with_client(op): + raise RuntimeError("auth-failed: unknown api key") + + with patch.object(auth, "_with_client", side_effect=fake_with_client): + with pytest.raises(web.HTTPUnauthorized): + await auth.authenticate( + make_request("Bearer tg_bogus") + ) + + +# -- API key cache --------------------------------------------------------- + + +class TestApiKeyCache: + + @pytest.mark.asyncio + async def test_cache_hit_skips_iam(self): + auth = IamAuth(backend=Mock()) + calls = {"n": 0} + + async def fake_with_client(op): + calls["n"] += 1 + return await op(Mock( + resolve_api_key=AsyncMock( + return_value=("u", "default", ["reader"]), + ) + )) + + with patch.object(auth, "_with_client", side_effect=fake_with_client): + await auth.authenticate(make_request("Bearer tg_k1")) + await auth.authenticate(make_request("Bearer tg_k1")) + await auth.authenticate(make_request("Bearer tg_k1")) + + # Only the first lookup reaches IAM; the rest are cache hits. 
+ assert calls["n"] == 1 + + @pytest.mark.asyncio + async def test_different_keys_are_separately_cached(self): + auth = IamAuth(backend=Mock()) + seen = [] + + async def fake_with_client(op): + async def resolve(plaintext): + seen.append(plaintext) + return ("u-" + plaintext, "default", ["reader"]) + return await op(Mock(resolve_api_key=resolve)) + + with patch.object(auth, "_with_client", side_effect=fake_with_client): + a = await auth.authenticate(make_request("Bearer tg_a")) + b = await auth.authenticate(make_request("Bearer tg_b")) + + assert a.handle == "u-tg_a" + assert b.handle == "u-tg_b" + assert seen == ["tg_a", "tg_b"] + + @pytest.mark.asyncio + async def test_cache_has_ttl_constant_set(self): + # Not a behaviour test — just ensures we don't accidentally + # set TTL to 0 (which would defeat the cache) or to a week. + assert 10 <= API_KEY_CACHE_TTL <= 3600 + + +# -- IamAuth.authorise ----------------------------------------------------- + + +class TestAuthorise: + """``authorise()`` is the gateway's only authorisation entry + point under the IAM contract. 
It calls iam-svc, caches the + decision for the regime's TTL (clamped above), and raises 403 + on deny / 401 on regime error (fail closed).""" + + def _make_identity(self, handle="u-1", workspace="default"): + return Identity( + handle=handle, workspace=workspace, + principal_id=handle, source="api-key", + ) + + @pytest.mark.asyncio + async def test_allow_returns_no_exception(self): + auth = IamAuth(backend=Mock()) + + async def fake_with_client(op): + return await op(Mock( + authorise=AsyncMock(return_value=(True, 30)), + )) + + with patch.object(auth, "_with_client", side_effect=fake_with_client): + await auth.authorise( + self._make_identity(), + "graph:read", + {"workspace": "default"}, + {}, + ) + + @pytest.mark.asyncio + async def test_deny_raises_403(self): + auth = IamAuth(backend=Mock()) + + async def fake_with_client(op): + return await op(Mock( + authorise=AsyncMock(return_value=(False, 30)), + )) + + with patch.object(auth, "_with_client", side_effect=fake_with_client): + with pytest.raises(web.HTTPForbidden): + await auth.authorise( + self._make_identity(), + "users:admin", + {}, + {"workspace": "acme"}, + ) + + @pytest.mark.asyncio + async def test_regime_error_fails_closed_as_401(self): + # If iam-svc errors, the gateway must NOT silently allow. 
+ auth = IamAuth(backend=Mock()) + + async def fake_with_client(op): + raise RuntimeError("iam-svc down") + + with patch.object(auth, "_with_client", side_effect=fake_with_client): + with pytest.raises(web.HTTPUnauthorized): + await auth.authorise( + self._make_identity(), + "graph:read", + {"workspace": "default"}, + {}, + ) + + @pytest.mark.asyncio + async def test_allow_decision_is_cached(self): + auth = IamAuth(backend=Mock()) + calls = {"n": 0} + + async def fake_with_client(op): + calls["n"] += 1 + return await op(Mock( + authorise=AsyncMock(return_value=(True, 30)), + )) + + with patch.object(auth, "_with_client", side_effect=fake_with_client): + ident = self._make_identity() + for _ in range(5): + await auth.authorise( + ident, "graph:read", {"workspace": "default"}, {}, + ) + + assert calls["n"] == 1 + + @pytest.mark.asyncio + async def test_deny_decision_is_cached(self): + auth = IamAuth(backend=Mock()) + calls = {"n": 0} + + async def fake_with_client(op): + calls["n"] += 1 + return await op(Mock( + authorise=AsyncMock(return_value=(False, 30)), + )) + + with patch.object(auth, "_with_client", side_effect=fake_with_client): + ident = self._make_identity() + for _ in range(5): + with pytest.raises(web.HTTPForbidden): + await auth.authorise( + ident, "users:admin", {}, {"workspace": "acme"}, + ) + + # Denies are cached too — repeated attempts don't re-hit IAM. 
+ assert calls["n"] == 1 + + @pytest.mark.asyncio + async def test_different_resources_cached_separately(self): + auth = IamAuth(backend=Mock()) + calls = {"n": 0} + + async def fake_with_client(op): + calls["n"] += 1 + return await op(Mock( + authorise=AsyncMock(return_value=(True, 30)), + )) + + with patch.object(auth, "_with_client", side_effect=fake_with_client): + ident = self._make_identity() + await auth.authorise( + ident, "graph:read", {"workspace": "a"}, {}, + ) + await auth.authorise( + ident, "graph:read", {"workspace": "b"}, {}, + ) + + # Different resource → different cache key → two IAM calls. + assert calls["n"] == 2 diff --git a/tests/unit/test_gateway/test_capabilities.py b/tests/unit/test_gateway/test_capabilities.py new file mode 100644 index 00000000..102e381e --- /dev/null +++ b/tests/unit/test_gateway/test_capabilities.py @@ -0,0 +1,171 @@ +""" +Tests for gateway/capabilities.py — the thin authorisation surface +under the IAM contract. + +The gateway no longer holds policy state (roles, capability sets, +workspace scopes); those live in iam-svc. These tests cover only +what the gateway shim does itself: PUBLIC / AUTHENTICATED short- +circuiting, default-fill of workspace, and forwarding of capability +checks to ``auth.authorise``. 
+""" + +import pytest +from aiohttp import web +from unittest.mock import AsyncMock, MagicMock + +from trustgraph.gateway.capabilities import ( + PUBLIC, AUTHENTICATED, + enforce, enforce_workspace, + access_denied, auth_failure, +) + + +# -- test fixtures --------------------------------------------------------- + + +class _Identity: + """Stand-in for auth.Identity — under the IAM contract it has + just ``handle``, ``workspace``, ``principal_id``, ``source``.""" + + def __init__(self, handle="user-1", workspace="default"): + self.handle = handle + self.workspace = workspace + self.principal_id = handle + self.source = "api-key" + + +def _allow_auth(identity=None): + """Build an Auth double that authenticates to ``identity`` and + allows every authorise() call.""" + auth = MagicMock() + auth.authenticate = AsyncMock( + return_value=identity or _Identity(), + ) + auth.authorise = AsyncMock(return_value=None) + return auth + + +def _deny_auth(identity=None): + """Build an Auth double that authenticates but denies authorise.""" + auth = MagicMock() + auth.authenticate = AsyncMock( + return_value=identity or _Identity(), + ) + auth.authorise = AsyncMock(side_effect=access_denied()) + return auth + + +# -- enforce() ------------------------------------------------------------- + + +class TestEnforce: + + @pytest.mark.asyncio + async def test_public_returns_none_no_auth(self): + auth = _allow_auth() + result = await enforce(MagicMock(), auth, PUBLIC) + assert result is None + auth.authenticate.assert_not_called() + auth.authorise.assert_not_called() + + @pytest.mark.asyncio + async def test_authenticated_skips_authorise(self): + identity = _Identity() + auth = _allow_auth(identity) + result = await enforce(MagicMock(), auth, AUTHENTICATED) + assert result is identity + auth.authenticate.assert_awaited_once() + auth.authorise.assert_not_called() + + @pytest.mark.asyncio + async def test_capability_calls_authorise_system_level(self): + identity = _Identity() + auth = 
_allow_auth(identity) + result = await enforce(MagicMock(), auth, "graph:read") + assert result is identity + auth.authorise.assert_awaited_once_with( + identity, "graph:read", {}, {}, + ) + + @pytest.mark.asyncio + async def test_capability_denied_raises_forbidden(self): + auth = _deny_auth() + with pytest.raises(web.HTTPForbidden): + await enforce(MagicMock(), auth, "users:admin") + + +# -- enforce_workspace() --------------------------------------------------- + + +class TestEnforceWorkspace: + + @pytest.mark.asyncio + async def test_default_fills_from_identity(self): + data = {"operation": "x"} + auth = _allow_auth() + await enforce_workspace(data, _Identity(workspace="default"), auth) + assert data["workspace"] == "default" + + @pytest.mark.asyncio + async def test_caller_supplied_workspace_kept(self): + data = {"workspace": "acme", "operation": "x"} + auth = _allow_auth() + await enforce_workspace(data, _Identity(workspace="default"), auth) + assert data["workspace"] == "acme" + + @pytest.mark.asyncio + async def test_no_capability_skips_authorise(self): + data = {"workspace": "default"} + auth = _allow_auth() + await enforce_workspace(data, _Identity(), auth) + auth.authorise.assert_not_called() + + @pytest.mark.asyncio + async def test_capability_calls_authorise_with_resource(self): + data = {"workspace": "acme"} + identity = _Identity() + auth = _allow_auth(identity) + await enforce_workspace( + data, identity, auth, capability="graph:read", + ) + auth.authorise.assert_awaited_once_with( + identity, "graph:read", {"workspace": "acme"}, {}, + ) + + @pytest.mark.asyncio + async def test_capability_denied_propagates(self): + data = {"workspace": "acme"} + auth = _deny_auth() + with pytest.raises(web.HTTPForbidden): + await enforce_workspace( + data, _Identity(), auth, capability="users:admin", + ) + + @pytest.mark.asyncio + async def test_non_dict_passthrough(self): + auth = _allow_auth() + result = await enforce_workspace("not-a-dict", _Identity(), auth) + 
assert result == "not-a-dict" + auth.authorise.assert_not_called() + + +# -- helpers --------------------------------------------------------------- + + +class TestResponseHelpers: + + def test_auth_failure_is_401(self): + exc = auth_failure() + assert exc.status == 401 + assert "auth failure" in exc.text + + def test_access_denied_is_403(self): + exc = access_denied() + assert exc.status == 403 + assert "access denied" in exc.text + + +class TestSentinels: + + def test_public_and_authenticated_are_distinct(self): + assert PUBLIC != AUTHENTICATED diff --git a/tests/unit/test_gateway/test_dispatch_manager.py b/tests/unit/test_gateway/test_dispatch_manager.py index f091a46d..e399d712 100644 --- a/tests/unit/test_gateway/test_dispatch_manager.py +++ b/tests/unit/test_gateway/test_dispatch_manager.py @@ -42,7 +42,7 @@ class TestDispatcherManager: mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) assert manager.backend == mock_backend assert manager.config_receiver == mock_config_receiver @@ -59,7 +59,10 @@ class TestDispatcherManager: mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver, prefix="custom-prefix") + manager = DispatcherManager( + mock_backend, mock_config_receiver, + auth=Mock(), prefix="custom-prefix", + ) assert manager.prefix == "custom-prefix" @@ -68,7 +71,7 @@ class TestDispatcherManager: """Test start_flow method""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) flow_data = {"name": "test_flow", "steps": []} @@ -82,7 +85,7 @@ class TestDispatcherManager: """Test stop_flow method""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, 
mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) # Pre-populate with a flow flow_data = {"name": "test_flow", "steps": []} @@ -96,7 +99,7 @@ class TestDispatcherManager: """Test dispatch_global_service returns DispatcherWrapper""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) wrapper = manager.dispatch_global_service() @@ -107,7 +110,7 @@ class TestDispatcherManager: """Test dispatch_core_export returns DispatcherWrapper""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) wrapper = manager.dispatch_core_export() @@ -118,7 +121,7 @@ class TestDispatcherManager: """Test dispatch_core_import returns DispatcherWrapper""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) wrapper = manager.dispatch_core_import() @@ -130,7 +133,7 @@ class TestDispatcherManager: """Test process_core_import method""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) with patch('trustgraph.gateway.dispatch.manager.CoreImport') as mock_core_import: mock_importer = Mock() @@ -148,7 +151,7 @@ class TestDispatcherManager: """Test process_core_export method""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) with patch('trustgraph.gateway.dispatch.manager.CoreExport') as mock_core_export: mock_exporter = Mock() @@ 
-166,7 +169,7 @@ class TestDispatcherManager: """Test process_global_service method""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) manager.invoke_global_service = AsyncMock(return_value="global_result") @@ -181,7 +184,7 @@ class TestDispatcherManager: """Test invoke_global_service with existing dispatcher""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) # Pre-populate with existing dispatcher mock_dispatcher = Mock() @@ -198,7 +201,7 @@ class TestDispatcherManager: """Test invoke_global_service creates new dispatcher""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) with patch('trustgraph.gateway.dispatch.manager.global_dispatchers') as mock_dispatchers: mock_dispatcher_class = Mock() @@ -230,7 +233,7 @@ class TestDispatcherManager: """Test dispatch_flow_import returns correct method""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) result = manager.dispatch_flow_import() @@ -240,7 +243,7 @@ class TestDispatcherManager: """Test dispatch_flow_export returns correct method""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) result = manager.dispatch_flow_export() @@ -250,7 +253,7 @@ class TestDispatcherManager: """Test dispatch_socket returns correct method""" mock_backend = Mock() mock_config_receiver = Mock() - manager = 
DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) result = manager.dispatch_socket() @@ -260,7 +263,7 @@ class TestDispatcherManager: """Test dispatch_flow_service returns DispatcherWrapper""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) wrapper = manager.dispatch_flow_service() @@ -272,7 +275,7 @@ class TestDispatcherManager: """Test process_flow_import with valid flow and kind""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) # Setup test flow manager.flows[("default", "test_flow")] = { @@ -308,7 +311,7 @@ class TestDispatcherManager: """Test process_flow_import with invalid flow""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) params = {"flow": "invalid_flow", "kind": "triples"} @@ -323,7 +326,7 @@ class TestDispatcherManager: warnings.simplefilter("ignore", RuntimeWarning) mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) # Setup test flow manager.flows[("default", "test_flow")] = { @@ -345,7 +348,7 @@ class TestDispatcherManager: """Test process_flow_export with valid flow and kind""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) # Setup test flow manager.flows[("default", "test_flow")] = { @@ -378,26 +381,47 @@ class TestDispatcherManager: 
@pytest.mark.asyncio async def test_process_socket(self): - """Test process_socket method""" + """process_socket constructs a Mux with the manager's auth + instance passed through — this is the gateway's trust path + for first-frame WebSocket authentication. A Mux cannot be + built without auth (tested separately); this test pins that + the dispatcher-manager threads the correct auth value into + the Mux constructor call.""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) - + mock_auth = Mock() + manager = DispatcherManager( + mock_backend, mock_config_receiver, auth=mock_auth, + ) + with patch('trustgraph.gateway.dispatch.manager.Mux') as mock_mux: mock_mux_instance = Mock() mock_mux.return_value = mock_mux_instance - + result = await manager.process_socket("ws", "running", {}) - - mock_mux.assert_called_once_with(manager, "ws", "running") + + mock_mux.assert_called_once_with( + manager, "ws", "running", auth=mock_auth, + ) assert result == mock_mux_instance + def test_dispatcher_manager_requires_auth(self): + """Constructing a DispatcherManager without an auth argument + must fail — a no-auth DispatcherManager would produce a + Mux without authentication, silently downgrading the socket + auth path.""" + mock_backend = Mock() + mock_config_receiver = Mock() + + with pytest.raises(ValueError, match="auth"): + DispatcherManager(mock_backend, mock_config_receiver, auth=None) + @pytest.mark.asyncio async def test_process_flow_service(self): """Test process_flow_service method""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) manager.invoke_flow_service = AsyncMock(return_value="flow_result") @@ -412,7 +436,7 @@ class TestDispatcherManager: """Test invoke_flow_service with existing dispatcher""" mock_backend = Mock() mock_config_receiver = Mock() - 
manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) # Add flow to the flows dictionary manager.flows[("default", "test_flow")] = {"services": {"agent": {}}} @@ -432,7 +456,7 @@ class TestDispatcherManager: """Test invoke_flow_service creates request-response dispatcher""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) # Setup test flow manager.flows[("default", "test_flow")] = { @@ -476,7 +500,7 @@ class TestDispatcherManager: """Test invoke_flow_service creates sender dispatcher""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) # Setup test flow manager.flows[("default", "test_flow")] = { @@ -516,7 +540,7 @@ class TestDispatcherManager: """Test invoke_flow_service with invalid flow""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) with pytest.raises(RuntimeError, match="Invalid flow"): await manager.invoke_flow_service("data", "responder", "default", "invalid_flow", "agent") @@ -526,7 +550,7 @@ class TestDispatcherManager: """Test invoke_flow_service with kind not supported by flow""" mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) # Setup test flow without agent interface manager.flows[("default", "test_flow")] = { @@ -543,7 +567,7 @@ class TestDispatcherManager: """Test invoke_flow_service with invalid kind""" mock_backend = Mock() mock_config_receiver = Mock() - manager = 
DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) # Setup test flow with interface but unsupported kind manager.flows[("default", "test_flow")] = { @@ -570,7 +594,7 @@ class TestDispatcherManager: """ mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) async def slow_start(): # Yield to the event loop so other coroutines get a chance to run, @@ -606,7 +630,7 @@ class TestDispatcherManager: """ mock_backend = Mock() mock_config_receiver = Mock() - manager = DispatcherManager(mock_backend, mock_config_receiver) + manager = DispatcherManager(mock_backend, mock_config_receiver, auth=Mock()) manager.flows[("default", "test_flow")] = { "interfaces": { diff --git a/tests/unit/test_gateway/test_dispatch_mux.py b/tests/unit/test_gateway/test_dispatch_mux.py index a0bc9460..c1baa920 100644 --- a/tests/unit/test_gateway/test_dispatch_mux.py +++ b/tests/unit/test_gateway/test_dispatch_mux.py @@ -12,6 +12,19 @@ from trustgraph.gateway.dispatch.mux import Mux, MAX_QUEUE_SIZE class TestMux: """Test cases for Mux class""" + def test_mux_requires_auth(self): + """Constructing a Mux without an ``auth`` argument must + fail. 
The Mux implements the first-frame auth protocol and + there is no no-auth mode — a no-auth Mux would silently + accept every frame without authenticating it.""" + with pytest.raises(ValueError, match="auth"): + Mux( + dispatcher_manager=MagicMock(), + ws=MagicMock(), + running=MagicMock(), + auth=None, + ) + def test_mux_initialization(self): """Test Mux initialization""" mock_dispatcher_manager = MagicMock() @@ -21,7 +34,8 @@ class TestMux: mux = Mux( dispatcher_manager=mock_dispatcher_manager, ws=mock_ws, - running=mock_running + running=mock_running, + auth=MagicMock(), ) assert mux.dispatcher_manager == mock_dispatcher_manager @@ -40,7 +54,8 @@ class TestMux: mux = Mux( dispatcher_manager=mock_dispatcher_manager, ws=mock_ws, - running=mock_running + running=mock_running, + auth=MagicMock(), ) # Call destroy @@ -61,7 +76,8 @@ class TestMux: mux = Mux( dispatcher_manager=mock_dispatcher_manager, ws=None, - running=mock_running + running=mock_running, + auth=MagicMock(), ) # Call destroy @@ -81,7 +97,8 @@ class TestMux: mux = Mux( dispatcher_manager=mock_dispatcher_manager, ws=mock_ws, - running=mock_running + running=mock_running, + auth=MagicMock(), ) # Mock message with valid JSON @@ -108,7 +125,8 @@ class TestMux: mux = Mux( dispatcher_manager=mock_dispatcher_manager, ws=mock_ws, - running=mock_running + running=mock_running, + auth=MagicMock(), ) # Mock message without request field @@ -137,7 +155,8 @@ class TestMux: mux = Mux( dispatcher_manager=mock_dispatcher_manager, ws=mock_ws, - running=mock_running + running=mock_running, + auth=MagicMock(), ) # Mock message without id field @@ -164,7 +183,8 @@ class TestMux: mux = Mux( dispatcher_manager=mock_dispatcher_manager, ws=mock_ws, - running=mock_running + running=mock_running, + auth=MagicMock(), ) # Mock message with invalid JSON diff --git a/tests/unit/test_gateway/test_endpoint_constant.py b/tests/unit/test_gateway/test_endpoint_constant.py index f208c967..98588e55 100644 --- 
a/tests/unit/test_gateway/test_endpoint_constant.py +++ b/tests/unit/test_gateway/test_endpoint_constant.py @@ -13,29 +13,36 @@ class TestConstantEndpoint: """Test cases for ConstantEndpoint class""" def test_constant_endpoint_initialization(self): - """Test ConstantEndpoint initialization""" + """Construction records the configured capability on the + instance. The capability is a required argument — no + permissive default — and the test passes an explicit + value to demonstrate the contract.""" mock_auth = MagicMock() mock_dispatcher = MagicMock() - + endpoint = ConstantEndpoint( endpoint_path="/api/test", auth=mock_auth, - dispatcher=mock_dispatcher + dispatcher=mock_dispatcher, + capability="config:read", ) - + assert endpoint.path == "/api/test" assert endpoint.auth == mock_auth assert endpoint.dispatcher == mock_dispatcher - assert endpoint.operation == "service" + assert endpoint.capability == "config:read" @pytest.mark.asyncio async def test_constant_endpoint_start_method(self): """Test ConstantEndpoint start method (should be no-op)""" mock_auth = MagicMock() mock_dispatcher = MagicMock() - - endpoint = ConstantEndpoint("/api/test", mock_auth, mock_dispatcher) - + + endpoint = ConstantEndpoint( + "/api/test", mock_auth, mock_dispatcher, + capability="config:read", + ) + # start() should complete without error await endpoint.start() @@ -44,10 +51,13 @@ class TestConstantEndpoint: mock_auth = MagicMock() mock_dispatcher = MagicMock() mock_app = MagicMock() - - endpoint = ConstantEndpoint("/api/test", mock_auth, mock_dispatcher) + + endpoint = ConstantEndpoint( + "/api/test", mock_auth, mock_dispatcher, + capability="config:read", + ) endpoint.add_routes(mock_app) - + # Verify add_routes was called with POST route mock_app.add_routes.assert_called_once() # The call should include web.post with the path and handler diff --git a/tests/unit/test_gateway/test_endpoint_i18n.py b/tests/unit/test_gateway/test_endpoint_i18n.py index ab693cdf..c2b51568 100644 --- 
a/tests/unit/test_gateway/test_endpoint_i18n.py +++ b/tests/unit/test_gateway/test_endpoint_i18n.py @@ -1,4 +1,12 @@ -"""Tests for Gateway i18n pack endpoint.""" +"""Tests for Gateway i18n pack endpoint. + +Production registers this endpoint with ``capability=PUBLIC``: the +login UI needs to render its own i18n strings before any user has +authenticated, so the endpoint is deliberately pre-auth. These +tests exercise the PUBLIC configuration — that is the production +contract. Behaviour of authenticated endpoints is covered by the +IamAuth tests in ``test_auth.py``. +""" import json from unittest.mock import MagicMock @@ -7,6 +15,7 @@ import pytest from aiohttp import web from trustgraph.gateway.endpoint.i18n import I18nPackEndpoint +from trustgraph.gateway.capabilities import PUBLIC class TestI18nPackEndpoint: @@ -17,23 +26,28 @@ class TestI18nPackEndpoint: endpoint = I18nPackEndpoint( endpoint_path="/api/v1/i18n/packs/{lang}", auth=mock_auth, + capability=PUBLIC, ) assert endpoint.path == "/api/v1/i18n/packs/{lang}" assert endpoint.auth == mock_auth - assert endpoint.operation == "service" + assert endpoint.capability == PUBLIC @pytest.mark.asyncio async def test_i18n_endpoint_start_method(self): mock_auth = MagicMock() - endpoint = I18nPackEndpoint("/api/v1/i18n/packs/{lang}", mock_auth) + endpoint = I18nPackEndpoint( + "/api/v1/i18n/packs/{lang}", mock_auth, capability=PUBLIC, + ) await endpoint.start() def test_add_routes_registers_get_handler(self): mock_auth = MagicMock() mock_app = MagicMock() - endpoint = I18nPackEndpoint("/api/v1/i18n/packs/{lang}", mock_auth) + endpoint = I18nPackEndpoint( + "/api/v1/i18n/packs/{lang}", mock_auth, capability=PUBLIC, + ) endpoint.add_routes(mock_app) mock_app.add_routes.assert_called_once() @@ -41,35 +55,55 @@ class TestI18nPackEndpoint: assert len(call_args) == 1 @pytest.mark.asyncio - async def test_handle_unauthorized_on_invalid_auth_scheme(self): + async def test_handle_returns_pack_without_authenticating(self): + 
"""The PUBLIC endpoint serves the language pack without + invoking the auth handler at all — pre-login UI must be + reachable. The test uses an auth mock that raises if + touched, so any auth attempt by the endpoint is caught.""" mock_auth = MagicMock() - mock_auth.permitted.return_value = True - endpoint = I18nPackEndpoint("/api/v1/i18n/packs/{lang}", mock_auth) + def _should_not_be_called(*args, **kwargs): + raise AssertionError( + "PUBLIC endpoint must not invoke auth.authenticate" + ) + mock_auth.authenticate = _should_not_be_called + + endpoint = I18nPackEndpoint( + "/api/v1/i18n/packs/{lang}", mock_auth, capability=PUBLIC, + ) request = MagicMock() request.path = "/api/v1/i18n/packs/en" + # A caller-supplied Authorization header of any form should + # be ignored — PUBLIC means we don't look at it. request.headers = {"Authorization": "Token abc"} request.match_info = {"lang": "en"} - resp = await endpoint.handle(request) - assert isinstance(resp, web.HTTPUnauthorized) - - @pytest.mark.asyncio - async def test_handle_returns_pack_when_permitted(self): - mock_auth = MagicMock() - mock_auth.permitted.return_value = True - - endpoint = I18nPackEndpoint("/api/v1/i18n/packs/{lang}", mock_auth) - - request = MagicMock() - request.path = "/api/v1/i18n/packs/en" - request.headers = {} - request.match_info = {"lang": "en"} - resp = await endpoint.handle(request) assert resp.status == 200 payload = json.loads(resp.body.decode("utf-8")) assert isinstance(payload, dict) assert "cli.verify_system_status.title" in payload + + @pytest.mark.asyncio + async def test_handle_rejects_path_traversal(self): + """The ``lang`` path parameter is reflected through to the + filesystem-backed pack loader. 
The endpoint contains an + explicit defense against ``/`` and ``..`` in the value; this + test pins that defense in place.""" + mock_auth = MagicMock() + endpoint = I18nPackEndpoint( + "/api/v1/i18n/packs/{lang}", mock_auth, capability=PUBLIC, + ) + + for bad in ("../../etc/passwd", "en/../fr", "a/b"): + request = MagicMock() + request.path = f"/api/v1/i18n/packs/{bad}" + request.headers = {} + request.match_info = {"lang": bad} + + resp = await endpoint.handle(request) + assert isinstance(resp, web.HTTPBadRequest), ( + f"path-traversal defense did not reject lang={bad!r}" + ) diff --git a/tests/unit/test_gateway/test_endpoint_manager.py b/tests/unit/test_gateway/test_endpoint_manager.py index 4766f8d7..8f659b71 100644 --- a/tests/unit/test_gateway/test_endpoint_manager.py +++ b/tests/unit/test_gateway/test_endpoint_manager.py @@ -12,30 +12,24 @@ class TestEndpointManager: """Test cases for EndpointManager class""" def test_endpoint_manager_initialization(self): - """Test EndpointManager initialization creates all endpoints""" + """EndpointManager wires up the full endpoint set and + records dispatcher_manager / timeout on the instance.""" mock_dispatcher_manager = MagicMock() mock_auth = MagicMock() - - # Mock dispatcher methods - mock_dispatcher_manager.dispatch_global_service.return_value = MagicMock() - mock_dispatcher_manager.dispatch_socket.return_value = MagicMock() - mock_dispatcher_manager.dispatch_flow_service.return_value = MagicMock() - mock_dispatcher_manager.dispatch_flow_import.return_value = MagicMock() - mock_dispatcher_manager.dispatch_flow_export.return_value = MagicMock() - mock_dispatcher_manager.dispatch_core_import.return_value = MagicMock() - mock_dispatcher_manager.dispatch_core_export.return_value = MagicMock() - + + # The dispatcher_manager exposes a small set of factory + # methods — MagicMock auto-creates them, returning fresh + # MagicMocks on each call. 
manager = EndpointManager( dispatcher_manager=mock_dispatcher_manager, auth=mock_auth, prometheus_url="http://prometheus:9090", - timeout=300 + timeout=300, ) - + assert manager.dispatcher_manager == mock_dispatcher_manager assert manager.timeout == 300 - assert manager.services == {} - assert len(manager.endpoints) > 0 # Should have multiple endpoints + assert len(manager.endpoints) > 0 def test_endpoint_manager_with_default_timeout(self): """Test EndpointManager with default timeout value""" @@ -79,9 +73,17 @@ class TestEndpointManager: prometheus_url="http://test:9090" ) - # Verify all dispatcher methods were called during initialization + # Each dispatcher factory is invoked once per endpoint that + # needs a dedicated wire. dispatch_auth_iam is shared by + # two endpoints — AuthEndpoints (login / bootstrap / + # change-password) and IamEndpoint (registry-driven + # /api/v1/iam) — so it's expected to be called twice. + # Both forwarders pin the dispatcher to kind=iam and reuse + # the same factory; they're distinct from + # dispatch_global_service (the generic /api/v1/{kind} route). 
mock_dispatcher_manager.dispatch_global_service.assert_called_once() - mock_dispatcher_manager.dispatch_socket.assert_called() # Called twice + assert mock_dispatcher_manager.dispatch_auth_iam.call_count == 2 + mock_dispatcher_manager.dispatch_socket.assert_called_once() mock_dispatcher_manager.dispatch_flow_service.assert_called_once() mock_dispatcher_manager.dispatch_flow_import.assert_called_once() mock_dispatcher_manager.dispatch_flow_export.assert_called_once() diff --git a/tests/unit/test_gateway/test_endpoint_metrics.py b/tests/unit/test_gateway/test_endpoint_metrics.py index bacf551d..6d911bbd 100644 --- a/tests/unit/test_gateway/test_endpoint_metrics.py +++ b/tests/unit/test_gateway/test_endpoint_metrics.py @@ -12,31 +12,35 @@ class TestMetricsEndpoint: """Test cases for MetricsEndpoint class""" def test_metrics_endpoint_initialization(self): - """Test MetricsEndpoint initialization""" + """Construction records the configured capability on the + instance. In production MetricsEndpoint is gated by + 'metrics:read' so that's the natural value to pass.""" mock_auth = MagicMock() - + endpoint = MetricsEndpoint( prometheus_url="http://prometheus:9090", endpoint_path="/metrics", - auth=mock_auth + auth=mock_auth, + capability="metrics:read", ) - + assert endpoint.prometheus_url == "http://prometheus:9090" assert endpoint.path == "/metrics" assert endpoint.auth == mock_auth - assert endpoint.operation == "service" + assert endpoint.capability == "metrics:read" @pytest.mark.asyncio async def test_metrics_endpoint_start_method(self): """Test MetricsEndpoint start method (should be no-op)""" mock_auth = MagicMock() - + endpoint = MetricsEndpoint( prometheus_url="http://localhost:9090", endpoint_path="/metrics", - auth=mock_auth + auth=mock_auth, + capability="metrics:read", ) - + # start() should complete without error await endpoint.start() @@ -44,15 +48,16 @@ class TestMetricsEndpoint: """Test add_routes method registers GET route with wildcard path""" mock_auth = 
MagicMock() mock_app = MagicMock() - + endpoint = MetricsEndpoint( prometheus_url="http://prometheus:9090", endpoint_path="/metrics", - auth=mock_auth + auth=mock_auth, + capability="metrics:read", ) - + endpoint.add_routes(mock_app) - + # Verify add_routes was called with GET route mock_app.add_routes.assert_called_once() # The call should include web.get with wildcard path pattern diff --git a/tests/unit/test_gateway/test_endpoint_socket.py b/tests/unit/test_gateway/test_endpoint_socket.py index 83eb38c2..189bc32b 100644 --- a/tests/unit/test_gateway/test_endpoint_socket.py +++ b/tests/unit/test_gateway/test_endpoint_socket.py @@ -1,5 +1,12 @@ """ -Tests for Gateway Socket Endpoint +Tests for Gateway Socket Endpoint. + +In production the only SocketEndpoint registered with HTTP-layer +auth is ``/api/v1/socket`` using ``capability=AUTHENTICATED`` with +``in_band_auth=True`` (first-frame auth over the websocket frames, +not at the handshake). The tests below use AUTHENTICATED as the +representative capability; construction / worker / listener +behaviour is independent of which capability is configured. """ import pytest @@ -7,41 +14,47 @@ from unittest.mock import MagicMock, AsyncMock from aiohttp import WSMsgType from trustgraph.gateway.endpoint.socket import SocketEndpoint +from trustgraph.gateway.capabilities import AUTHENTICATED class TestSocketEndpoint: """Test cases for SocketEndpoint class""" def test_socket_endpoint_initialization(self): - """Test SocketEndpoint initialization""" + """Construction records the configured capability on the + instance. 
No permissive default is applied.""" mock_auth = MagicMock() mock_dispatcher = MagicMock() - + endpoint = SocketEndpoint( endpoint_path="/api/socket", auth=mock_auth, - dispatcher=mock_dispatcher + dispatcher=mock_dispatcher, + capability=AUTHENTICATED, ) - + assert endpoint.path == "/api/socket" assert endpoint.auth == mock_auth assert endpoint.dispatcher == mock_dispatcher - assert endpoint.operation == "socket" + assert endpoint.capability == AUTHENTICATED @pytest.mark.asyncio async def test_worker_method(self): """Test SocketEndpoint worker method""" mock_auth = MagicMock() mock_dispatcher = AsyncMock() - - endpoint = SocketEndpoint("/api/socket", mock_auth, mock_dispatcher) - + + endpoint = SocketEndpoint( + "/api/socket", mock_auth, mock_dispatcher, + capability=AUTHENTICATED, + ) + mock_ws = MagicMock() mock_running = MagicMock() - + # Call worker method await endpoint.worker(mock_ws, mock_dispatcher, mock_running) - + # Verify dispatcher.run was called mock_dispatcher.run.assert_called_once() @@ -50,8 +63,11 @@ class TestSocketEndpoint: """Test SocketEndpoint listener method with text message""" mock_auth = MagicMock() mock_dispatcher = AsyncMock() - - endpoint = SocketEndpoint("/api/socket", mock_auth, mock_dispatcher) + + endpoint = SocketEndpoint( + "/api/socket", mock_auth, mock_dispatcher, + capability=AUTHENTICATED, + ) # Mock websocket with text message mock_msg = MagicMock() @@ -80,8 +96,11 @@ class TestSocketEndpoint: """Test SocketEndpoint listener method with binary message""" mock_auth = MagicMock() mock_dispatcher = AsyncMock() - - endpoint = SocketEndpoint("/api/socket", mock_auth, mock_dispatcher) + + endpoint = SocketEndpoint( + "/api/socket", mock_auth, mock_dispatcher, + capability=AUTHENTICATED, + ) # Mock websocket with binary message mock_msg = MagicMock() @@ -110,8 +129,11 @@ class TestSocketEndpoint: """Test SocketEndpoint listener method with close message""" mock_auth = MagicMock() mock_dispatcher = AsyncMock() - - endpoint = 
SocketEndpoint("/api/socket", mock_auth, mock_dispatcher) + + endpoint = SocketEndpoint( + "/api/socket", mock_auth, mock_dispatcher, + capability=AUTHENTICATED, + ) # Mock websocket with close message mock_msg = MagicMock() diff --git a/tests/unit/test_gateway/test_endpoint_stream.py b/tests/unit/test_gateway/test_endpoint_stream.py index b99946c8..a3b49465 100644 --- a/tests/unit/test_gateway/test_endpoint_stream.py +++ b/tests/unit/test_gateway/test_endpoint_stream.py @@ -12,48 +12,57 @@ class TestStreamEndpoint: """Test cases for StreamEndpoint class""" def test_stream_endpoint_initialization_with_post(self): - """Test StreamEndpoint initialization with POST method""" + """Construction records the configured capability on the + instance. StreamEndpoint is used in production for the + core-import / core-export / document-stream routes; a + document-write capability is a realistic value for a POST + stream (e.g. core-import).""" mock_auth = MagicMock() mock_dispatcher = MagicMock() - + endpoint = StreamEndpoint( endpoint_path="/api/stream", auth=mock_auth, dispatcher=mock_dispatcher, - method="POST" + capability="documents:write", + method="POST", ) - + assert endpoint.path == "/api/stream" assert endpoint.auth == mock_auth assert endpoint.dispatcher == mock_dispatcher - assert endpoint.operation == "service" + assert endpoint.capability == "documents:write" assert endpoint.method == "POST" def test_stream_endpoint_initialization_with_get(self): - """Test StreamEndpoint initialization with GET method""" + """GET stream — export-style endpoint, read capability.""" mock_auth = MagicMock() mock_dispatcher = MagicMock() - + endpoint = StreamEndpoint( endpoint_path="/api/stream", auth=mock_auth, dispatcher=mock_dispatcher, - method="GET" + capability="documents:read", + method="GET", ) - + assert endpoint.method == "GET" def test_stream_endpoint_initialization_default_method(self): - """Test StreamEndpoint initialization with default POST method""" + """Test 
StreamEndpoint initialization with default POST method. + The method default is cosmetic; the capability is not + defaulted — it is always required.""" mock_auth = MagicMock() mock_dispatcher = MagicMock() - + endpoint = StreamEndpoint( endpoint_path="/api/stream", auth=mock_auth, - dispatcher=mock_dispatcher + dispatcher=mock_dispatcher, + capability="documents:write", ) - + assert endpoint.method == "POST" # Default value @pytest.mark.asyncio @@ -61,9 +70,12 @@ class TestStreamEndpoint: """Test StreamEndpoint start method (should be no-op)""" mock_auth = MagicMock() mock_dispatcher = MagicMock() - - endpoint = StreamEndpoint("/api/stream", mock_auth, mock_dispatcher) - + + endpoint = StreamEndpoint( + "/api/stream", mock_auth, mock_dispatcher, + capability="documents:write", + ) + # start() should complete without error await endpoint.start() @@ -72,16 +84,17 @@ class TestStreamEndpoint: mock_auth = MagicMock() mock_dispatcher = MagicMock() mock_app = MagicMock() - + endpoint = StreamEndpoint( endpoint_path="/api/stream", auth=mock_auth, dispatcher=mock_dispatcher, - method="POST" + capability="documents:write", + method="POST", ) - + endpoint.add_routes(mock_app) - + # Verify add_routes was called with POST route mock_app.add_routes.assert_called_once() call_args = mock_app.add_routes.call_args[0][0] @@ -92,16 +105,17 @@ class TestStreamEndpoint: mock_auth = MagicMock() mock_dispatcher = MagicMock() mock_app = MagicMock() - + endpoint = StreamEndpoint( endpoint_path="/api/stream", auth=mock_auth, dispatcher=mock_dispatcher, - method="GET" + capability="documents:read", + method="GET", ) - + endpoint.add_routes(mock_app) - + # Verify add_routes was called with GET route mock_app.add_routes.assert_called_once() call_args = mock_app.add_routes.call_args[0][0] @@ -112,13 +126,14 @@ class TestStreamEndpoint: mock_auth = MagicMock() mock_dispatcher = MagicMock() mock_app = MagicMock() - + endpoint = StreamEndpoint( endpoint_path="/api/stream", auth=mock_auth, 
dispatcher=mock_dispatcher, - method="INVALID" + capability="documents:write", + method="INVALID", ) - + with pytest.raises(RuntimeError, match="Bad method"): endpoint.add_routes(mock_app) \ No newline at end of file diff --git a/tests/unit/test_gateway/test_endpoint_variable.py b/tests/unit/test_gateway/test_endpoint_variable.py index ffaf4e9a..1cdc8f9f 100644 --- a/tests/unit/test_gateway/test_endpoint_variable.py +++ b/tests/unit/test_gateway/test_endpoint_variable.py @@ -12,29 +12,36 @@ class TestVariableEndpoint: """Test cases for VariableEndpoint class""" def test_variable_endpoint_initialization(self): - """Test VariableEndpoint initialization""" + """Construction records the configured capability on the + instance. VariableEndpoint is used in production for the + /api/v1/{kind} admin-scoped global service routes, so a + write-side capability is a realistic value for the test.""" mock_auth = MagicMock() mock_dispatcher = MagicMock() - + endpoint = VariableEndpoint( endpoint_path="/api/variable", auth=mock_auth, - dispatcher=mock_dispatcher + dispatcher=mock_dispatcher, + capability="config:write", ) - + assert endpoint.path == "/api/variable" assert endpoint.auth == mock_auth assert endpoint.dispatcher == mock_dispatcher - assert endpoint.operation == "service" + assert endpoint.capability == "config:write" @pytest.mark.asyncio async def test_variable_endpoint_start_method(self): """Test VariableEndpoint start method (should be no-op)""" mock_auth = MagicMock() mock_dispatcher = MagicMock() - - endpoint = VariableEndpoint("/api/var", mock_auth, mock_dispatcher) - + + endpoint = VariableEndpoint( + "/api/var", mock_auth, mock_dispatcher, + capability="config:write", + ) + # start() should complete without error await endpoint.start() @@ -43,10 +50,13 @@ class TestVariableEndpoint: mock_auth = MagicMock() mock_dispatcher = MagicMock() mock_app = MagicMock() - - endpoint = VariableEndpoint("/api/variable", mock_auth, mock_dispatcher) + + endpoint = 
VariableEndpoint( + "/api/variable", mock_auth, mock_dispatcher, + capability="config:write", + ) endpoint.add_routes(mock_app) - + # Verify add_routes was called with POST route mock_app.add_routes.assert_called_once() call_args = mock_app.add_routes.call_args[0][0] diff --git a/tests/unit/test_gateway/test_service.py b/tests/unit/test_gateway/test_service.py index 71428db4..107e6819 100644 --- a/tests/unit/test_gateway/test_service.py +++ b/tests/unit/test_gateway/test_service.py @@ -1,355 +1,179 @@ """ -Tests for Gateway Service API +Tests for gateway/service.py — the Api class that wires together +the pub/sub backend, IAM auth, config receiver, dispatcher manager, +and endpoint manager. + +The legacy ``GATEWAY_SECRET`` / ``default_api_token`` / allow-all +surface is gone, so the tests here focus on the Api's construction +and composition rather than the removed auth behaviour. IamAuth's +own behaviour is covered in test_auth.py. """ import pytest -import asyncio -from unittest.mock import Mock, patch, MagicMock, AsyncMock +from unittest.mock import AsyncMock, Mock, patch from aiohttp import web -import pulsar -from trustgraph.gateway.service import Api, run, default_pulsar_host, default_prometheus_url, default_timeout, default_port, default_api_token - -# Tests for Gateway Service API +from trustgraph.gateway.service import ( + Api, + default_pulsar_host, default_prometheus_url, + default_timeout, default_port, +) +from trustgraph.gateway.auth import IamAuth -class TestApi: - """Test cases for Api class""" - +# -- constants ------------------------------------------------------------- - def test_api_initialization_with_defaults(self): - """Test Api initialization with default values""" - with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub: - mock_backend = Mock() - mock_get_pubsub.return_value = mock_backend - api = Api() +class TestDefaults: - assert api.port == default_port - assert api.timeout == default_timeout - assert api.pulsar_host == 
default_pulsar_host - assert api.pulsar_api_key is None - assert api.prometheus_url == default_prometheus_url + "/" - assert api.auth.allow_all is True + def test_exports_default_constants(self): + # These are consumed by CLIs / tests / docs. Sanity-check + # that they're the expected shape. + assert default_port == 8088 + assert default_timeout == 600 + assert default_pulsar_host.startswith("pulsar://") + assert default_prometheus_url.startswith("http") - # Verify get_pubsub was called - mock_get_pubsub.assert_called_once() - def test_api_initialization_with_custom_config(self): - """Test Api initialization with custom configuration""" +# -- Api construction ------------------------------------------------------ + + +@pytest.fixture +def mock_backend(): + return Mock() + + +@pytest.fixture +def api(mock_backend): + with patch( + "trustgraph.gateway.service.get_pubsub", + return_value=mock_backend, + ): + yield Api() + + +class TestApiConstruction: + + def test_defaults(self, api): + assert api.port == default_port + assert api.timeout == default_timeout + assert api.pulsar_host == default_pulsar_host + assert api.pulsar_api_key is None + # prometheus_url gets normalised with a trailing slash + assert api.prometheus_url == default_prometheus_url + "/" + + def test_auth_is_iam_backed(self, api): + # Any Api always gets an IamAuth. There is no "no auth" mode + # (GATEWAY_SECRET / allow_all has been removed — see IAM spec). + assert isinstance(api.auth, IamAuth) + + def test_components_wired(self, api): + assert api.config_receiver is not None + assert api.dispatcher_manager is not None + assert api.endpoint_manager is not None + + def test_dispatcher_manager_has_auth(self, api): + # The Mux uses this handle for first-frame socket auth. 
+ assert api.dispatcher_manager.auth is api.auth + + def test_custom_config(self, mock_backend): config = { "port": 9000, "timeout": 300, "pulsar_host": "pulsar://custom-host:6650", - "pulsar_api_key": "test-api-key", - "pulsar_listener": "custom-listener", + "pulsar_api_key": "custom-key", "prometheus_url": "http://custom-prometheus:9090", - "api_token": "secret-token" } + with patch( + "trustgraph.gateway.service.get_pubsub", + return_value=mock_backend, + ): + a = Api(**config) - with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub: - mock_backend = Mock() - mock_get_pubsub.return_value = mock_backend + assert a.port == 9000 + assert a.timeout == 300 + assert a.pulsar_host == "pulsar://custom-host:6650" + assert a.pulsar_api_key == "custom-key" + # Trailing slash added. + assert a.prometheus_url == "http://custom-prometheus:9090/" - api = Api(**config) + def test_prometheus_url_already_has_trailing_slash(self, mock_backend): + with patch( + "trustgraph.gateway.service.get_pubsub", + return_value=mock_backend, + ): + a = Api(prometheus_url="http://p:9090/") + assert a.prometheus_url == "http://p:9090/" - assert api.port == 9000 - assert api.timeout == 300 - assert api.pulsar_host == "pulsar://custom-host:6650" - assert api.pulsar_api_key == "test-api-key" - assert api.prometheus_url == "http://custom-prometheus:9090/" - assert api.auth.token == "secret-token" - assert api.auth.allow_all is False + def test_queue_overrides_parsed_for_config(self, mock_backend): + with patch( + "trustgraph.gateway.service.get_pubsub", + return_value=mock_backend, + ): + a = Api( + config_request_queue="alt-config-req", + config_response_queue="alt-config-resp", + ) + overrides = a.dispatcher_manager.queue_overrides + assert overrides.get("config", {}).get("request") == "alt-config-req" + assert overrides.get("config", {}).get("response") == "alt-config-resp" - # Verify get_pubsub was called with config - mock_get_pubsub.assert_called_once_with(**config) - def 
test_api_initialization_with_pulsar_api_key(self): - """Test Api initialization with Pulsar API key authentication""" - with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub: - mock_get_pubsub.return_value = Mock() +# -- app_factory ----------------------------------------------------------- - api = Api(pulsar_api_key="test-key") - # Verify api key was stored - assert api.pulsar_api_key == "test-key" - mock_get_pubsub.assert_called_once() - - def test_api_initialization_prometheus_url_normalization(self): - """Test that prometheus_url gets normalized with trailing slash""" - with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub: - mock_get_pubsub.return_value = Mock() - - # Test URL without trailing slash - api = Api(prometheus_url="http://prometheus:9090") - assert api.prometheus_url == "http://prometheus:9090/" - - # Test URL with trailing slash - api = Api(prometheus_url="http://prometheus:9090/") - assert api.prometheus_url == "http://prometheus:9090/" - - def test_api_initialization_empty_api_token_means_no_auth(self): - """Test that empty API token results in allow_all authentication""" - with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub: - mock_get_pubsub.return_value = Mock() - - api = Api(api_token="") - assert api.auth.allow_all is True - - def test_api_initialization_none_api_token_means_no_auth(self): - """Test that None API token results in allow_all authentication""" - with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub: - mock_get_pubsub.return_value = Mock() - - api = Api(api_token=None) - assert api.auth.allow_all is True +class TestAppFactory: @pytest.mark.asyncio - async def test_app_factory_creates_application(self): - """Test that app_factory creates aiohttp application""" - with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub: - mock_get_pubsub.return_value = Mock() - - api = Api() - - # Mock the dependencies - api.config_receiver = Mock() - 
api.config_receiver.start = AsyncMock() - api.endpoint_manager = Mock() - api.endpoint_manager.add_routes = Mock() - api.endpoint_manager.start = AsyncMock() - - app = await api.app_factory() - - assert isinstance(app, web.Application) - assert app._client_max_size == 256 * 1024 * 1024 - - # Verify that config receiver was started - api.config_receiver.start.assert_called_once() - - # Verify that endpoint manager was configured - api.endpoint_manager.add_routes.assert_called_once_with(app) - api.endpoint_manager.start.assert_called_once() + async def test_creates_aiohttp_app(self, api): + # Stub out the long-tail dependencies that reach out to IAM / + # pub/sub so we can exercise the factory in isolation. + api.auth.start = AsyncMock() + api.config_receiver = Mock() + api.config_receiver.start = AsyncMock() + api.endpoint_manager = Mock() + api.endpoint_manager.add_routes = Mock() + api.endpoint_manager.start = AsyncMock() + api.endpoints = [] + + app = await api.app_factory() + + assert isinstance(app, web.Application) + assert app._client_max_size == 256 * 1024 * 1024 + api.auth.start.assert_called_once() + api.config_receiver.start.assert_called_once() + api.endpoint_manager.add_routes.assert_called_once_with(app) + api.endpoint_manager.start.assert_called_once() @pytest.mark.asyncio - async def test_app_factory_with_custom_endpoints(self): - """Test app_factory with custom endpoints""" - with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub: - mock_get_pubsub.return_value = Mock() - - api = Api() - - # Mock custom endpoints - mock_endpoint1 = Mock() - mock_endpoint1.add_routes = Mock() - mock_endpoint1.start = AsyncMock() - - mock_endpoint2 = Mock() - mock_endpoint2.add_routes = Mock() - mock_endpoint2.start = AsyncMock() - - api.endpoints = [mock_endpoint1, mock_endpoint2] - - # Mock the dependencies - api.config_receiver = Mock() - api.config_receiver.start = AsyncMock() - api.endpoint_manager = Mock() - api.endpoint_manager.add_routes = 
Mock() - api.endpoint_manager.start = AsyncMock() - - app = await api.app_factory() - - # Verify custom endpoints were configured - mock_endpoint1.add_routes.assert_called_once_with(app) - mock_endpoint1.start.assert_called_once() - mock_endpoint2.add_routes.assert_called_once_with(app) - mock_endpoint2.start.assert_called_once() + async def test_auth_start_runs_before_accepting_traffic(self, api): + """``auth.start()`` fetches the IAM signing key, and must + complete (or time out) before the gateway begins accepting + requests. It's the first await in app_factory.""" + order = [] - def test_run_method_calls_web_run_app(self): - """Test that run method calls web.run_app""" - with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub, \ - patch('aiohttp.web.run_app') as mock_run_app: - mock_get_pubsub.return_value = Mock() + # AsyncMock.side_effect expects a sync callable (its return + # value becomes the coroutine's return); a plain list.append + # avoids the "coroutine was never awaited" trap of an async + # side_effect. + api.auth.start = AsyncMock( + side_effect=lambda: order.append("auth"), + ) + api.config_receiver = Mock() + api.config_receiver.start = AsyncMock( + side_effect=lambda: order.append("config"), + ) + api.endpoint_manager = Mock() + api.endpoint_manager.add_routes = Mock() + api.endpoint_manager.start = AsyncMock( + side_effect=lambda: order.append("endpoints"), + ) + api.endpoints = [] - # Api.run() passes self.app_factory() — a coroutine — to - # web.run_app, which would normally consume it inside its own - # event loop. Since we mock run_app, close the coroutine here - # so it doesn't leak as an "unawaited coroutine" RuntimeWarning. 
- def _consume_coro(coro, **kwargs): - coro.close() - mock_run_app.side_effect = _consume_coro + await api.app_factory() - api = Api(port=8080) - api.run() - - # Verify run_app was called once with the correct port - mock_run_app.assert_called_once() - args, kwargs = mock_run_app.call_args - assert len(args) == 1 # Should have one positional arg (the coroutine) - assert kwargs == {'port': 8080} # Should have port keyword arg - - def test_api_components_initialization(self): - """Test that all API components are properly initialized""" - with patch('trustgraph.gateway.service.get_pubsub') as mock_get_pubsub: - mock_get_pubsub.return_value = Mock() - - api = Api() - - # Verify all components are initialized - assert api.config_receiver is not None - assert api.dispatcher_manager is not None - assert api.endpoint_manager is not None - assert api.endpoints == [] - - # Verify component relationships - assert api.dispatcher_manager.backend == api.pubsub_backend - assert api.dispatcher_manager.config_receiver == api.config_receiver - assert api.endpoint_manager.dispatcher_manager == api.dispatcher_manager - # EndpointManager doesn't store auth directly, it passes it to individual endpoints - - -class TestRunFunction: - """Test cases for the run() function""" - - def test_run_function_with_metrics_enabled(self): - """Test run function with metrics enabled""" - import warnings - # Suppress the specific async warning with a broader pattern - warnings.filterwarnings("ignore", message=".*Api.app_factory.*was never awaited", category=RuntimeWarning) - - with patch('argparse.ArgumentParser.parse_args') as mock_parse_args, \ - patch('trustgraph.gateway.service.start_http_server') as mock_start_http_server: - - # Mock command line arguments - mock_args = Mock() - mock_args.metrics = True - mock_args.metrics_port = 8000 - mock_parse_args.return_value = mock_args - - # Create a simple mock instance without any async methods - mock_api_instance = Mock() - mock_api_instance.run = 
Mock() - - # Create a mock Api class without importing the real one - mock_api = Mock(return_value=mock_api_instance) - - # Patch using context manager to avoid importing the real Api class - with patch('trustgraph.gateway.service.Api', mock_api): - # Mock vars() to return a dict - with patch('builtins.vars') as mock_vars: - mock_vars.return_value = { - 'metrics': True, - 'metrics_port': 8000, - 'pulsar_host': default_pulsar_host, - 'timeout': default_timeout - } - - run() - - # Verify metrics server was started - mock_start_http_server.assert_called_once_with(8000) - - # Verify Api was created and run was called - mock_api.assert_called_once() - mock_api_instance.run.assert_called_once() - - @patch('trustgraph.gateway.service.start_http_server') - @patch('argparse.ArgumentParser.parse_args') - def test_run_function_with_metrics_disabled(self, mock_parse_args, mock_start_http_server): - """Test run function with metrics disabled""" - # Mock command line arguments - mock_args = Mock() - mock_args.metrics = False - mock_parse_args.return_value = mock_args - - # Create a simple mock instance without any async methods - mock_api_instance = Mock() - mock_api_instance.run = Mock() - - # Patch the Api class inside the test without using decorators - with patch('trustgraph.gateway.service.Api') as mock_api: - mock_api.return_value = mock_api_instance - - # Mock vars() to return a dict - with patch('builtins.vars') as mock_vars: - mock_vars.return_value = { - 'metrics': False, - 'metrics_port': 8000, - 'pulsar_host': default_pulsar_host, - 'timeout': default_timeout - } - - run() - - # Verify metrics server was NOT started - mock_start_http_server.assert_not_called() - - # Verify Api was created and run was called - mock_api.assert_called_once() - mock_api_instance.run.assert_called_once() - - @patch('argparse.ArgumentParser.parse_args') - def test_run_function_argument_parsing(self, mock_parse_args): - """Test that run function properly parses command line arguments""" - # 
Mock command line arguments - mock_args = Mock() - mock_args.metrics = False - mock_parse_args.return_value = mock_args - - # Create a simple mock instance without any async methods - mock_api_instance = Mock() - mock_api_instance.run = Mock() - - # Mock vars() to return a dict with all expected arguments - expected_args = { - 'pulsar_host': 'pulsar://test:6650', - 'pulsar_api_key': 'test-key', - 'pulsar_listener': 'test-listener', - 'prometheus_url': 'http://test-prometheus:9090', - 'port': 9000, - 'timeout': 300, - 'api_token': 'secret', - 'log_level': 'INFO', - 'metrics': False, - 'metrics_port': 8001 - } - - # Patch the Api class inside the test without using decorators - with patch('trustgraph.gateway.service.Api') as mock_api: - mock_api.return_value = mock_api_instance - - with patch('builtins.vars') as mock_vars: - mock_vars.return_value = expected_args - - run() - - # Verify Api was created with the parsed arguments - mock_api.assert_called_once_with(**expected_args) - mock_api_instance.run.assert_called_once() - - def test_run_function_creates_argument_parser(self): - """Test that run function creates argument parser with correct arguments""" - with patch('argparse.ArgumentParser') as mock_parser_class: - mock_parser = Mock() - mock_parser_class.return_value = mock_parser - mock_parser.parse_args.return_value = Mock(metrics=False) - - with patch('trustgraph.gateway.service.Api') as mock_api, \ - patch('builtins.vars') as mock_vars: - mock_vars.return_value = {'metrics': False} - mock_api.return_value = Mock() - - run() - - # Verify ArgumentParser was created - mock_parser_class.assert_called_once() - - # Verify add_argument was called for each expected argument - expected_arguments = [ - 'pulsar-host', 'pulsar-api-key', 'pulsar-listener', - 'prometheus-url', 'port', 'timeout', 'api-token', - 'log-level', 'metrics', 'metrics-port' - ] - - # Check that add_argument was called multiple times (once for each arg) - assert mock_parser.add_argument.call_count >= 
len(expected_arguments) \ No newline at end of file + # auth.start must be first (before config receiver, before + # any endpoint starts). + assert order[0] == "auth" + # All three must have run. + assert set(order) == {"auth", "config", "endpoints"} diff --git a/tests/unit/test_gateway/test_socket_graceful_shutdown.py b/tests/unit/test_gateway/test_socket_graceful_shutdown.py index 1a63227d..6c3e323b 100644 --- a/tests/unit/test_gateway/test_socket_graceful_shutdown.py +++ b/tests/unit/test_gateway/test_socket_graceful_shutdown.py @@ -1,4 +1,15 @@ -"""Unit tests for SocketEndpoint graceful shutdown functionality.""" +"""Unit tests for SocketEndpoint graceful shutdown functionality. + +These tests exercise SocketEndpoint in its handshake-auth +configuration (``in_band_auth=False``) — the mode used in production +for the flow import/export streaming endpoints. The mux socket at +``/api/v1/socket`` uses ``in_band_auth=True`` instead, where the +handshake always accepts and authentication runs on the first +WebSocket frame; that path is covered by the Mux tests. + +Every endpoint constructor here passes an explicit capability — no +permissive default is relied upon. +""" import pytest import asyncio @@ -6,13 +17,32 @@ from unittest.mock import AsyncMock, MagicMock, patch from aiohttp import web, WSMsgType from trustgraph.gateway.endpoint.socket import SocketEndpoint from trustgraph.gateway.running import Running +from trustgraph.gateway.auth import Identity + + +# Representative capability used across these tests — corresponds to +# the flow-import streaming endpoint pattern that uses this class. +TEST_CAP = "graph:write" + + +def _valid_identity(): + return Identity( + handle="test-user", + workspace="default", + principal_id="test-user", + source="api-key", + ) @pytest.fixture def mock_auth(): - """Mock authentication service.""" + """Mock IAM-backed authenticator. 
Successful by default — + ``authenticate`` returns a valid identity and ``authorise`` + allows everything. Tests that need the failure paths override + the relevant attribute locally.""" auth = MagicMock() - auth.permitted.return_value = True + auth.authenticate = AsyncMock(return_value=_valid_identity()) + auth.authorise = AsyncMock(return_value=None) return auth @@ -25,7 +55,7 @@ def mock_dispatcher_factory(): dispatcher.receive = AsyncMock() dispatcher.destroy = AsyncMock() return dispatcher - + return dispatcher_factory @@ -35,7 +65,8 @@ def socket_endpoint(mock_auth, mock_dispatcher_factory): return SocketEndpoint( endpoint_path="/test-socket", auth=mock_auth, - dispatcher=mock_dispatcher_factory + dispatcher=mock_dispatcher_factory, + capability=TEST_CAP, ) @@ -61,7 +92,10 @@ def mock_request(): @pytest.mark.asyncio async def test_listener_graceful_shutdown_on_close(): """Test listener handles websocket close gracefully.""" - socket_endpoint = SocketEndpoint("/test", MagicMock(), AsyncMock()) + socket_endpoint = SocketEndpoint( + "/test", MagicMock(), AsyncMock(), + capability=TEST_CAP, + ) # Mock websocket that closes after one message ws = AsyncMock() @@ -99,9 +133,10 @@ async def test_listener_graceful_shutdown_on_close(): @pytest.mark.asyncio async def test_handle_normal_flow(): - """Test normal websocket handling flow.""" + """Valid bearer → handshake accepted, dispatcher created.""" mock_auth = MagicMock() - mock_auth.permitted.return_value = True + mock_auth.authenticate = AsyncMock(return_value=_valid_identity()) + mock_auth.authorise = AsyncMock(return_value=None) dispatcher_created = False async def mock_dispatcher_factory(ws, running, match_info): @@ -111,7 +146,10 @@ async def test_handle_normal_flow(): dispatcher.destroy = AsyncMock() return dispatcher - socket_endpoint = SocketEndpoint("/test", mock_auth, mock_dispatcher_factory) + socket_endpoint = SocketEndpoint( + "/test", mock_auth, mock_dispatcher_factory, + capability=TEST_CAP, + ) request 
= MagicMock() request.query = {"token": "valid-token"} @@ -155,7 +193,8 @@ async def test_handle_normal_flow(): async def test_handle_exception_group_cleanup(): """Test exception group triggers dispatcher cleanup.""" mock_auth = MagicMock() - mock_auth.permitted.return_value = True + mock_auth.authenticate = AsyncMock(return_value=_valid_identity()) + mock_auth.authorise = AsyncMock(return_value=None) mock_dispatcher = AsyncMock() mock_dispatcher.destroy = AsyncMock() @@ -163,7 +202,10 @@ async def test_handle_exception_group_cleanup(): async def mock_dispatcher_factory(ws, running, match_info): return mock_dispatcher - socket_endpoint = SocketEndpoint("/test", mock_auth, mock_dispatcher_factory) + socket_endpoint = SocketEndpoint( + "/test", mock_auth, mock_dispatcher_factory, + capability=TEST_CAP, + ) request = MagicMock() request.query = {"token": "valid-token"} @@ -222,7 +264,8 @@ async def test_handle_exception_group_cleanup(): async def test_handle_dispatcher_cleanup_timeout(): """Test dispatcher cleanup with timeout.""" mock_auth = MagicMock() - mock_auth.permitted.return_value = True + mock_auth.authenticate = AsyncMock(return_value=_valid_identity()) + mock_auth.authorise = AsyncMock(return_value=None) # Mock dispatcher that takes long to destroy mock_dispatcher = AsyncMock() @@ -231,7 +274,10 @@ async def test_handle_dispatcher_cleanup_timeout(): async def mock_dispatcher_factory(ws, running, match_info): return mock_dispatcher - socket_endpoint = SocketEndpoint("/test", mock_auth, mock_dispatcher_factory) + socket_endpoint = SocketEndpoint( + "/test", mock_auth, mock_dispatcher_factory, + capability=TEST_CAP, + ) request = MagicMock() request.query = {"token": "valid-token"} @@ -285,49 +331,68 @@ async def test_handle_dispatcher_cleanup_timeout(): @pytest.mark.asyncio async def test_handle_unauthorized_request(): - """Test handling of unauthorized requests.""" + """A bearer that the IAM layer rejects causes the handshake to + fail with 401. 
IamAuth surfaces an HTTPUnauthorized; the + endpoint propagates it. Note that the endpoint intentionally + does NOT distinguish 'bad token', 'expired', 'revoked', etc. — + that's the IAM error-masking policy.""" mock_auth = MagicMock() - mock_auth.permitted.return_value = False # Unauthorized - - socket_endpoint = SocketEndpoint("/test", mock_auth, AsyncMock()) - + mock_auth.authenticate = AsyncMock(side_effect=web.HTTPUnauthorized( + text='{"error":"auth failure"}', + content_type="application/json", + )) + + socket_endpoint = SocketEndpoint( + "/test", mock_auth, AsyncMock(), + capability=TEST_CAP, + ) + request = MagicMock() request.query = {"token": "invalid-token"} - + result = await socket_endpoint.handle(request) - - # Should return HTTP 401 + assert isinstance(result, web.HTTPUnauthorized) - - # Should have checked permission - mock_auth.permitted.assert_called_once_with("invalid-token", "socket") + # authenticate must have been invoked with a synthetic request + # carrying Bearer . The endpoint wraps the query- + # string token into an Authorization header for a uniform auth + # path — the IAM layer does not look at query strings directly. 
+ mock_auth.authenticate.assert_called_once() + passed_req = mock_auth.authenticate.call_args.args[0] + assert passed_req.headers["Authorization"] == "Bearer invalid-token" @pytest.mark.asyncio async def test_handle_missing_token(): - """Test handling of requests with missing token.""" + """Request with no ``token`` query param → 401 before any + IAM call is made (cheap short-circuit).""" mock_auth = MagicMock() - mock_auth.permitted.return_value = False - - socket_endpoint = SocketEndpoint("/test", mock_auth, AsyncMock()) - + mock_auth.authenticate = AsyncMock( + side_effect=AssertionError( + "authenticate must not be invoked when no token is present" + ), + ) + + socket_endpoint = SocketEndpoint( + "/test", mock_auth, AsyncMock(), + capability=TEST_CAP, + ) + request = MagicMock() request.query = {} # No token - + result = await socket_endpoint.handle(request) - - # Should return HTTP 401 + assert isinstance(result, web.HTTPUnauthorized) - - # Should have checked permission with empty token - mock_auth.permitted.assert_called_once_with("", "socket") + mock_auth.authenticate.assert_not_called() @pytest.mark.asyncio async def test_handle_websocket_already_closed(): """Test handling when websocket is already closed.""" mock_auth = MagicMock() - mock_auth.permitted.return_value = True + mock_auth.authenticate = AsyncMock(return_value=_valid_identity()) + mock_auth.authorise = AsyncMock(return_value=None) mock_dispatcher = AsyncMock() mock_dispatcher.destroy = AsyncMock() @@ -335,7 +400,10 @@ async def test_handle_websocket_already_closed(): async def mock_dispatcher_factory(ws, running, match_info): return mock_dispatcher - socket_endpoint = SocketEndpoint("/test", mock_auth, mock_dispatcher_factory) + socket_endpoint = SocketEndpoint( + "/test", mock_auth, mock_dispatcher_factory, + capability=TEST_CAP, + ) request = MagicMock() request.query = {"token": "valid-token"} diff --git a/tests/unit/test_text_completion/test_ollama_processor.py 
b/tests/unit/test_text_completion/test_ollama_processor.py index 69baf85f..35bf182a 100644 --- a/tests/unit/test_text_completion/test_ollama_processor.py +++ b/tests/unit/test_text_completion/test_ollama_processor.py @@ -15,13 +15,13 @@ from trustgraph.base import LlmResult class TestOllamaProcessorSimple(IsolatedAsyncioTestCase): """Test Ollama processor functionality""" - @patch('trustgraph.model.text_completion.ollama.llm.Client') + @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.llm_service.LlmService.__init__') async def test_processor_initialization_basic(self, mock_llm_init, mock_async_init, mock_client_class): """Test basic processor initialization""" # Arrange - mock_client = MagicMock() + mock_client = AsyncMock() mock_client_class.return_value = mock_client # Mock the parent class initialization @@ -44,13 +44,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase): assert hasattr(processor, 'llm') mock_client_class.assert_called_once_with(host='http://localhost:11434') - @patch('trustgraph.model.text_completion.ollama.llm.Client') + @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.llm_service.LlmService.__init__') async def test_generate_content_success(self, mock_llm_init, mock_async_init, mock_client_class): """Test successful content generation""" # Arrange - mock_client = MagicMock() + mock_client = AsyncMock() mock_response = { 'response': 'Generated response from Ollama', 'prompt_eval_count': 15, @@ -83,13 +83,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase): assert result.model == 'llama2' mock_client.generate.assert_called_once_with('llama2', "System prompt\n\nUser prompt", options={'temperature': 0.0}) - @patch('trustgraph.model.text_completion.ollama.llm.Client') + 
@patch('trustgraph.model.text_completion.ollama.llm.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.llm_service.LlmService.__init__') async def test_generate_content_generic_exception(self, mock_llm_init, mock_async_init, mock_client_class): """Test handling of generic exceptions""" # Arrange - mock_client = MagicMock() + mock_client = AsyncMock() mock_client.generate.side_effect = Exception("Connection error") mock_client_class.return_value = mock_client @@ -110,13 +110,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase): with pytest.raises(Exception, match="Connection error"): await processor.generate_content("System prompt", "User prompt") - @patch('trustgraph.model.text_completion.ollama.llm.Client') + @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.llm_service.LlmService.__init__') async def test_processor_initialization_with_custom_parameters(self, mock_llm_init, mock_async_init, mock_client_class): """Test processor initialization with custom parameters""" # Arrange - mock_client = MagicMock() + mock_client = AsyncMock() mock_client_class.return_value = mock_client mock_async_init.return_value = None @@ -137,13 +137,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase): assert processor.default_model == 'mistral' mock_client_class.assert_called_once_with(host='http://192.168.1.100:11434') - @patch('trustgraph.model.text_completion.ollama.llm.Client') + @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.llm_service.LlmService.__init__') async def test_processor_initialization_with_defaults(self, mock_llm_init, mock_async_init, mock_client_class): """Test processor initialization with default values""" # Arrange - mock_client = MagicMock() + mock_client = AsyncMock() 
mock_client_class.return_value = mock_client mock_async_init.return_value = None @@ -164,13 +164,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase): # Should use default_ollama (http://localhost:11434 or from OLLAMA_HOST env) mock_client_class.assert_called_once() - @patch('trustgraph.model.text_completion.ollama.llm.Client') + @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.llm_service.LlmService.__init__') async def test_generate_content_empty_prompts(self, mock_llm_init, mock_async_init, mock_client_class): """Test content generation with empty prompts""" # Arrange - mock_client = MagicMock() + mock_client = AsyncMock() mock_response = { 'response': 'Default response', 'prompt_eval_count': 2, @@ -205,13 +205,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase): # The prompt should be "" + "\n\n" + "" = "\n\n" mock_client.generate.assert_called_once_with('llama2', "\n\n", options={'temperature': 0.0}) - @patch('trustgraph.model.text_completion.ollama.llm.Client') + @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.llm_service.LlmService.__init__') async def test_generate_content_token_counting(self, mock_llm_init, mock_async_init, mock_client_class): """Test token counting from Ollama response""" # Arrange - mock_client = MagicMock() + mock_client = AsyncMock() mock_response = { 'response': 'Test response', 'prompt_eval_count': 50, @@ -243,13 +243,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase): assert result.out_token == 25 assert result.model == 'llama2' - @patch('trustgraph.model.text_completion.ollama.llm.Client') + @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.llm_service.LlmService.__init__') async def 
test_ollama_client_initialization(self, mock_llm_init, mock_async_init, mock_client_class): """Test that Ollama client is initialized correctly""" # Arrange - mock_client = MagicMock() + mock_client = AsyncMock() mock_client_class.return_value = mock_client mock_async_init.return_value = None @@ -273,13 +273,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase): # Verify processor has the client assert processor.llm == mock_client - @patch('trustgraph.model.text_completion.ollama.llm.Client') + @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.llm_service.LlmService.__init__') async def test_generate_content_prompt_construction(self, mock_llm_init, mock_async_init, mock_client_class): """Test prompt construction with system and user prompts""" # Arrange - mock_client = MagicMock() + mock_client = AsyncMock() mock_response = { 'response': 'Response with system instructions', 'prompt_eval_count': 25, @@ -312,13 +312,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase): # Verify the combined prompt mock_client.generate.assert_called_once_with('llama2', "You are a helpful assistant\n\nWhat is AI?", options={'temperature': 0.0}) - @patch('trustgraph.model.text_completion.ollama.llm.Client') + @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.llm_service.LlmService.__init__') async def test_generate_content_temperature_override(self, mock_llm_init, mock_async_init, mock_client_class): """Test temperature parameter override functionality""" # Arrange - mock_client = MagicMock() + mock_client = AsyncMock() mock_response = { 'response': 'Response with custom temperature', 'prompt_eval_count': 20, @@ -360,13 +360,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase): options={'temperature': 0.8} # Should use runtime override ) - 
@patch('trustgraph.model.text_completion.ollama.llm.Client') + @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.llm_service.LlmService.__init__') async def test_generate_content_model_override(self, mock_llm_init, mock_async_init, mock_client_class): """Test model parameter override functionality""" # Arrange - mock_client = MagicMock() + mock_client = AsyncMock() mock_response = { 'response': 'Response with custom model', 'prompt_eval_count': 18, @@ -408,13 +408,13 @@ class TestOllamaProcessorSimple(IsolatedAsyncioTestCase): options={'temperature': 0.1} # Should use processor default ) - @patch('trustgraph.model.text_completion.ollama.llm.Client') + @patch('trustgraph.model.text_completion.ollama.llm.AsyncClient') @patch('trustgraph.base.async_processor.AsyncProcessor.__init__') @patch('trustgraph.base.llm_service.LlmService.__init__') async def test_generate_content_both_parameters_override(self, mock_llm_init, mock_async_init, mock_client_class): """Test overriding both model and temperature parameters simultaneously""" # Arrange - mock_client = MagicMock() + mock_client = AsyncMock() mock_response = { 'response': 'Response with both overrides', 'prompt_eval_count': 22, diff --git a/trustgraph-base/trustgraph/api/async_socket_client.py b/trustgraph-base/trustgraph/api/async_socket_client.py index e5d553ea..ca9146b9 100644 --- a/trustgraph-base/trustgraph/api/async_socket_client.py +++ b/trustgraph-base/trustgraph/api/async_socket_client.py @@ -49,21 +49,67 @@ class AsyncSocketClient: return f"ws://{url}" def _build_ws_url(self): - ws_url = f"{self.url.rstrip('/')}/api/v1/socket" - if self.token: - ws_url = f"{ws_url}?token={self.token}" - return ws_url + # /api/v1/socket uses the first-frame auth protocol — the + # token is sent as the first frame after connecting rather + # than in the URL. 
This avoids browser issues with 401 on + # the WebSocket handshake and lets long-lived sockets + # refresh credentials mid-session. + return f"{self.url.rstrip('/')}/api/v1/socket" async def connect(self): - """Establish the persistent websocket connection.""" + """Establish the persistent websocket connection and run the + first-frame auth handshake.""" if self._connected: return + if not self.token: + raise ProtocolException( + "AsyncSocketClient requires a token for first-frame " + "auth against /api/v1/socket" + ) + ws_url = self._build_ws_url() self._connect_cm = websockets.connect( ws_url, ping_interval=20, ping_timeout=self.timeout ) self._socket = await self._connect_cm.__aenter__() + + # First-frame auth: send {"type":"auth","token":"..."} and + # wait for auth-ok / auth-failed. Run before starting the + # reader task so the response isn't consumed by the reader's + # id-based routing. + await self._socket.send(json.dumps({ + "type": "auth", "token": self.token, + })) + try: + raw = await asyncio.wait_for( + self._socket.recv(), timeout=self.timeout, + ) + except asyncio.TimeoutError: + await self._socket.close() + raise ProtocolException("Timeout waiting for auth response") + + try: + resp = json.loads(raw) + except Exception: + await self._socket.close() + raise ProtocolException( + f"Unexpected non-JSON auth response: {raw!r}" + ) + + if resp.get("type") == "auth-ok": + self.workspace = resp.get("workspace", self.workspace) + elif resp.get("type") == "auth-failed": + await self._socket.close() + raise ProtocolException( + f"auth failure: {resp.get('error', 'unknown')}" + ) + else: + await self._socket.close() + raise ProtocolException( + f"Unexpected auth response: {resp!r}" + ) + self._connected = True self._reader_task = asyncio.create_task(self._reader()) diff --git a/trustgraph-base/trustgraph/api/socket_client.py b/trustgraph-base/trustgraph/api/socket_client.py index 4eade3e8..aeb15f85 100644 --- a/trustgraph-base/trustgraph/api/socket_client.py 
+++ b/trustgraph-base/trustgraph/api/socket_client.py @@ -112,10 +112,10 @@ class SocketClient: return f"ws://{url}" def _build_ws_url(self): - ws_url = f"{self.url.rstrip('/')}/api/v1/socket" - if self.token: - ws_url = f"{ws_url}?token={self.token}" - return ws_url + # /api/v1/socket uses the first-frame auth protocol — the + # token is sent as the first frame after connecting rather + # than in the URL. + return f"{self.url.rstrip('/')}/api/v1/socket" def _get_loop(self): """Get or create the event loop, reusing across calls.""" @@ -132,15 +132,58 @@ class SocketClient: return self._loop async def _ensure_connected(self): - """Lazily establish the persistent websocket connection.""" + """Lazily establish the persistent websocket connection and + run the first-frame auth handshake.""" if self._connected: return + if not self.token: + raise ProtocolException( + "SocketClient requires a token for first-frame auth " + "against /api/v1/socket" + ) + ws_url = self._build_ws_url() self._connect_cm = websockets.connect( ws_url, ping_interval=20, ping_timeout=self.timeout ) self._socket = await self._connect_cm.__aenter__() + + # First-frame auth — run before starting the reader so the + # auth-ok / auth-failed response isn't consumed by the reader + # loop's id-based routing. 
+ await self._socket.send(json.dumps({ + "type": "auth", "token": self.token, + })) + try: + raw = await asyncio.wait_for( + self._socket.recv(), timeout=self.timeout, + ) + except asyncio.TimeoutError: + await self._socket.close() + raise ProtocolException("Timeout waiting for auth response") + + try: + resp = json.loads(raw) + except Exception: + await self._socket.close() + raise ProtocolException( + f"Unexpected non-JSON auth response: {raw!r}" + ) + + if resp.get("type") == "auth-ok": + self.workspace = resp.get("workspace", self.workspace) + elif resp.get("type") == "auth-failed": + await self._socket.close() + raise ProtocolException( + f"auth failure: {resp.get('error', 'unknown')}" + ) + else: + await self._socket.close() + raise ProtocolException( + f"Unexpected auth response: {resp!r}" + ) + self._connected = True self._reader_task = asyncio.create_task(self._reader()) diff --git a/trustgraph-base/trustgraph/base/config_client.py b/trustgraph-base/trustgraph/base/config_client.py index 504a6d58..eb3892f8 100644 --- a/trustgraph-base/trustgraph/base/config_client.py +++ b/trustgraph-base/trustgraph/base/config_client.py @@ -84,6 +84,18 @@ class ConfigClient(RequestResponse): ) return resp.directory + async def get_all(self, workspace, timeout=CONFIG_TIMEOUT): + """Return every config entry in ``workspace`` as a nested dict + ``{type: {key: value}}``. Values are returned as the raw + strings stored by config-svc (typically JSON); callers parse + as needed. 
An empty dict means the workspace has no config.""" + resp = await self._request( + operation="config", + workspace=workspace, + timeout=timeout, + ) + return resp.config + async def workspaces_for_type(self, type, timeout=CONFIG_TIMEOUT): """Return the set of distinct workspaces with any config of the given type.""" diff --git a/trustgraph-base/trustgraph/base/iam_client.py b/trustgraph-base/trustgraph/base/iam_client.py new file mode 100644 index 00000000..4be59de1 --- /dev/null +++ b/trustgraph-base/trustgraph/base/iam_client.py @@ -0,0 +1,342 @@ + +import json + +from . request_response_spec import RequestResponse, RequestResponseSpec +from .. schema import ( + IamRequest, IamResponse, + UserInput, WorkspaceInput, ApiKeyInput, +) + +IAM_TIMEOUT = 10 + + +class IamClient(RequestResponse): + """Client for the IAM service request/response pub/sub protocol. + + Mirrors ``ConfigClient``: a thin wrapper around ``RequestResponse`` + that knows the IAM request / response schemas. Only the subset of + operations actually implemented by the server today has helper + methods here; callers that need an unimplemented operation can + build ``IamRequest`` and call ``request()`` directly. + """ + + async def _request(self, timeout=IAM_TIMEOUT, **kwargs): + resp = await self.request( + IamRequest(**kwargs), + timeout=timeout, + ) + if resp.error: + raise RuntimeError( + f"{resp.error.type}: {resp.error.message}" + ) + return resp + + async def bootstrap(self, timeout=IAM_TIMEOUT): + """Initial-run IAM self-seed. Returns a tuple of + ``(admin_user_id, admin_api_key_plaintext)``. Both are empty + strings on repeat calls — the operation is a no-op once the + IAM tables are populated.""" + resp = await self._request( + operation="bootstrap", timeout=timeout, + ) + return resp.bootstrap_admin_user_id, resp.bootstrap_admin_api_key + + async def bootstrap_status(self, timeout=IAM_TIMEOUT): + """Returns whether an unconsumed ``bootstrap`` call would + currently succeed (i.e. 
iam-svc is in ``bootstrap`` mode and + its tables are empty). Side-effect-free; intended for first- + run UX so a UI can decide whether to render setup.""" + resp = await self._request( + operation="bootstrap-status", timeout=timeout, + ) + return resp.bootstrap_available + + async def whoami(self, actor, timeout=IAM_TIMEOUT): + """Return the user record for ``actor`` (the authenticated + caller's handle). AUTHENTICATED-only; no capability check — + every authenticated user can read themselves.""" + resp = await self._request( + operation="whoami", + actor=actor, + timeout=timeout, + ) + return resp.user + + async def resolve_api_key(self, api_key, timeout=IAM_TIMEOUT): + """Resolve a plaintext API key to its identity triple. + + Returns ``(user_id, workspace, roles)`` or raises + ``RuntimeError`` with error type ``auth-failed`` if the key is + unknown / expired / revoked. + + Note: the ``roles`` value is a regime-internal hint and is + not used by the gateway directly under the IAM contract; + all authorisation decisions go through ``authorise()``. + Returned here only for backward compatibility with callers + that haven't migrated.""" + resp = await self._request( + operation="resolve-api-key", + api_key=api_key, + timeout=timeout, + ) + return ( + resp.resolved_user_id, + resp.resolved_workspace, + list(resp.resolved_roles), + ) + + async def authorise(self, identity_handle, capability, + resource, parameters, timeout=IAM_TIMEOUT): + """Ask the IAM regime whether ``identity_handle`` may perform + ``capability`` on ``resource`` given ``parameters``. + + Implements the contract ``authorise(identity, capability, + resource, parameters) → (decision, ttl)``. Returns a tuple + ``(allow: bool, ttl_seconds: int)``. 
The TTL is the + regime's suggested cache lifetime for this decision; the + gateway honours it (clamped above by gateway-side policy).""" + resp = await self._request( + operation="authorise", + user_id=identity_handle, + capability=capability, + resource_json=json.dumps(resource or {}, sort_keys=True), + parameters_json=json.dumps(parameters or {}, sort_keys=True), + timeout=timeout, + ) + return resp.decision_allow, resp.decision_ttl_seconds + + async def authorise_many(self, identity_handle, checks, + timeout=IAM_TIMEOUT): + """Bulk authorise. ``checks`` is a list of dicts each + carrying ``capability``, ``resource``, and ``parameters``. + Returns a list of ``(allow, ttl)`` tuples in the same order.""" + resp = await self._request( + operation="authorise-many", + user_id=identity_handle, + authorise_checks=json.dumps(list(checks), sort_keys=True), + timeout=timeout, + ) + decisions = json.loads(resp.decisions_json or "[]") + return [(d.get("allow", False), d.get("ttl", 0)) for d in decisions] + + async def create_user(self, workspace, user, actor="", + timeout=IAM_TIMEOUT): + """Create a user. ``user`` is a ``UserInput``.""" + resp = await self._request( + operation="create-user", + workspace=workspace, + actor=actor, + user=user, + timeout=timeout, + ) + return resp.user + + async def list_users(self, workspace, actor="", timeout=IAM_TIMEOUT): + resp = await self._request( + operation="list-users", + workspace=workspace, + actor=actor, + timeout=timeout, + ) + return list(resp.users) + + async def create_api_key(self, workspace, key, actor="", + timeout=IAM_TIMEOUT): + """Create an API key. ``key`` is an ``ApiKeyInput``. 
Returns + ``(plaintext, record)`` — plaintext is returned once and the + caller is responsible for surfacing it to the operator.""" + resp = await self._request( + operation="create-api-key", + workspace=workspace, + actor=actor, + key=key, + timeout=timeout, + ) + return resp.api_key_plaintext, resp.api_key + + async def list_api_keys(self, workspace, user_id, actor="", + timeout=IAM_TIMEOUT): + resp = await self._request( + operation="list-api-keys", + workspace=workspace, + actor=actor, + user_id=user_id, + timeout=timeout, + ) + return list(resp.api_keys) + + async def revoke_api_key(self, workspace, key_id, actor="", + timeout=IAM_TIMEOUT): + await self._request( + operation="revoke-api-key", + workspace=workspace, + actor=actor, + key_id=key_id, + timeout=timeout, + ) + + async def login(self, username, password, workspace="", + timeout=IAM_TIMEOUT): + """Validate credentials and return ``(jwt, expires_iso)``. + ``workspace`` is optional; defaults at the server to the + OSS default workspace.""" + resp = await self._request( + operation="login", + workspace=workspace, + username=username, + password=password, + timeout=timeout, + ) + return resp.jwt, resp.jwt_expires + + async def get_signing_key_public(self, timeout=IAM_TIMEOUT): + """Return the active JWT signing public key in PEM. The + gateway calls this at startup and caches the result.""" + resp = await self._request( + operation="get-signing-key-public", + timeout=timeout, + ) + return resp.signing_key_public + + async def change_password(self, user_id, current_password, + new_password, timeout=IAM_TIMEOUT): + await self._request( + operation="change-password", + user_id=user_id, + password=current_password, + new_password=new_password, + timeout=timeout, + ) + + async def reset_password(self, workspace, user_id, actor="", + timeout=IAM_TIMEOUT): + """Admin-driven password reset. 
Returns the plaintext + temporary password (returned once).""" + resp = await self._request( + operation="reset-password", + workspace=workspace, + actor=actor, + user_id=user_id, + timeout=timeout, + ) + return resp.temporary_password + + async def get_user(self, workspace, user_id, actor="", + timeout=IAM_TIMEOUT): + resp = await self._request( + operation="get-user", + workspace=workspace, + actor=actor, + user_id=user_id, + timeout=timeout, + ) + return resp.user + + async def update_user(self, workspace, user_id, user, actor="", + timeout=IAM_TIMEOUT): + resp = await self._request( + operation="update-user", + workspace=workspace, + actor=actor, + user_id=user_id, + user=user, + timeout=timeout, + ) + return resp.user + + async def disable_user(self, workspace, user_id, actor="", + timeout=IAM_TIMEOUT): + await self._request( + operation="disable-user", + workspace=workspace, + actor=actor, + user_id=user_id, + timeout=timeout, + ) + + async def enable_user(self, workspace, user_id, actor="", + timeout=IAM_TIMEOUT): + await self._request( + operation="enable-user", + workspace=workspace, + actor=actor, + user_id=user_id, + timeout=timeout, + ) + + async def delete_user(self, workspace, user_id, actor="", + timeout=IAM_TIMEOUT): + await self._request( + operation="delete-user", + workspace=workspace, + actor=actor, + user_id=user_id, + timeout=timeout, + ) + + async def create_workspace(self, workspace_record, actor="", + timeout=IAM_TIMEOUT): + resp = await self._request( + operation="create-workspace", + actor=actor, + workspace_record=workspace_record, + timeout=timeout, + ) + return resp.workspace + + async def list_workspaces(self, actor="", timeout=IAM_TIMEOUT): + resp = await self._request( + operation="list-workspaces", + actor=actor, + timeout=timeout, + ) + return list(resp.workspaces) + + async def get_workspace(self, workspace_id, actor="", + timeout=IAM_TIMEOUT): + from ..schema import WorkspaceInput + resp = await self._request( + 
operation="get-workspace", + actor=actor, + workspace_record=WorkspaceInput(id=workspace_id), + timeout=timeout, + ) + return resp.workspace + + async def update_workspace(self, workspace_record, actor="", + timeout=IAM_TIMEOUT): + resp = await self._request( + operation="update-workspace", + actor=actor, + workspace_record=workspace_record, + timeout=timeout, + ) + return resp.workspace + + async def disable_workspace(self, workspace_id, actor="", + timeout=IAM_TIMEOUT): + from ..schema import WorkspaceInput + await self._request( + operation="disable-workspace", + actor=actor, + workspace_record=WorkspaceInput(id=workspace_id), + timeout=timeout, + ) + + async def rotate_signing_key(self, actor="", timeout=IAM_TIMEOUT): + await self._request( + operation="rotate-signing-key", + actor=actor, + timeout=timeout, + ) + + +class IamClientSpec(RequestResponseSpec): + def __init__(self, request_name, response_name): + super().__init__( + request_name=request_name, + request_schema=IamRequest, + response_name=response_name, + response_schema=IamResponse, + impl=IamClient, + ) diff --git a/trustgraph-base/trustgraph/messaging/__init__.py b/trustgraph-base/trustgraph/messaging/__init__.py index 30f5061c..9fcfa6f7 100644 --- a/trustgraph-base/trustgraph/messaging/__init__.py +++ b/trustgraph-base/trustgraph/messaging/__init__.py @@ -15,6 +15,7 @@ from .translators.library import LibraryRequestTranslator, LibraryResponseTransl from .translators.document_loading import DocumentTranslator, TextDocumentTranslator from .translators.config import ConfigRequestTranslator, ConfigResponseTranslator from .translators.flow import FlowRequestTranslator, FlowResponseTranslator +from .translators.iam import IamRequestTranslator, IamResponseTranslator from .translators.prompt import PromptRequestTranslator, PromptResponseTranslator from .translators.tool import ToolRequestTranslator, ToolResponseTranslator from .translators.embeddings_query import ( @@ -85,11 +86,17 @@ 
from typing import Dict, Any, Tuple

from ...schema import IamRequest, IamResponse
from ...schema import (
    UserInput, UserRecord,
    WorkspaceInput, WorkspaceRecord,
    ApiKeyInput, ApiKeyRecord,
)
from .base import MessageTranslator


def _user_input_from_dict(data):
    """Build a UserInput from a request dict; None passes through."""
    if data is None:
        return None
    return UserInput(
        username=data.get("username", ""),
        name=data.get("name", ""),
        email=data.get("email", ""),
        password=data.get("password", ""),
        roles=list(data.get("roles", [])),
        enabled=data.get("enabled", True),
        must_change_password=data.get("must_change_password", False),
    )


def _workspace_input_from_dict(data):
    """Build a WorkspaceInput from a request dict; None passes through."""
    if data is None:
        return None
    return WorkspaceInput(
        id=data.get("id", ""),
        name=data.get("name", ""),
        enabled=data.get("enabled", True),
    )


def _api_key_input_from_dict(data):
    """Build an ApiKeyInput from a request dict; None passes through."""
    if data is None:
        return None
    return ApiKeyInput(
        user_id=data.get("user_id", ""),
        name=data.get("name", ""),
        expires=data.get("expires", ""),
    )


def _user_record_to_dict(record):
    """Serialise a UserRecord for the HTTP response; None passes through."""
    if record is None:
        return None
    return {
        "id": record.id,
        "workspace": record.workspace,
        "username": record.username,
        "name": record.name,
        "email": record.email,
        "roles": list(record.roles),
        "enabled": record.enabled,
        "must_change_password": record.must_change_password,
        "created": record.created,
    }


def _workspace_record_to_dict(record):
    """Serialise a WorkspaceRecord; None passes through."""
    if record is None:
        return None
    return {
        "id": record.id,
        "name": record.name,
        "enabled": record.enabled,
        "created": record.created,
    }


def _api_key_record_to_dict(record):
    """Serialise an ApiKeyRecord; None passes through.

    Only the key prefix is ever serialised — the plaintext travels
    separately (and once) via IamResponse.api_key_plaintext.
    """
    if record is None:
        return None
    return {
        "id": record.id,
        "user_id": record.user_id,
        "name": record.name,
        "prefix": record.prefix,
        "expires": record.expires,
        "created": record.created,
        "last_used": record.last_used,
    }


class IamRequestTranslator(MessageTranslator):
    """HTTP JSON <-> IamRequest mapping for the gateway's IAM forwarder."""

    def decode(self, data: Dict[str, Any]) -> IamRequest:
        """Decode a request dict into an IamRequest; absent fields
        take the schema defaults."""
        scalars = (
            "operation", "workspace", "actor", "user_id", "username",
            "key_id", "api_key", "password", "new_password",
        )
        kwargs = {name: data.get(name, "") for name in scalars}
        kwargs["user"] = _user_input_from_dict(data.get("user"))
        kwargs["workspace_record"] = _workspace_input_from_dict(
            data.get("workspace_record")
        )
        kwargs["key"] = _api_key_input_from_dict(data.get("key"))
        return IamRequest(**kwargs)

    def encode(self, obj: IamRequest) -> Dict[str, Any]:
        """Encode an IamRequest as a JSON-able dict.  `operation` is
        always present; other scalars are emitted only when non-empty."""
        out: Dict[str, Any] = {"operation": obj.operation}

        for name in (
            "workspace", "actor", "user_id", "username", "key_id",
            "api_key", "password", "new_password",
        ):
            value = getattr(obj, name, "")
            if value:
                out[name] = value

        user = obj.user
        if user is not None:
            out["user"] = {
                "username": user.username,
                "name": user.name,
                "email": user.email,
                "password": user.password,
                "roles": list(user.roles),
                "enabled": user.enabled,
                "must_change_password": user.must_change_password,
            }

        ws = obj.workspace_record
        if ws is not None:
            out["workspace_record"] = {
                "id": ws.id,
                "name": ws.name,
                "enabled": ws.enabled,
            }

        key = obj.key
        if key is not None:
            out["key"] = {
                "user_id": key.user_id,
                "name": key.name,
                "expires": key.expires,
            }

        return out


class IamResponseTranslator(MessageTranslator):
    """IamResponse -> HTTP JSON mapping (one direction only)."""

    def decode(self, data: Dict[str, Any]) -> IamResponse:
        # Responses only ever flow schema -> HTTP.
        raise NotImplementedError(
            "IamResponse is a server-produced message; no HTTP→schema "
            "path is needed"
        )

    def encode(self, obj: IamResponse) -> Dict[str, Any]:
        """Encode an IamResponse as a JSON-able dict, emitting only the
        fields the operation actually populated.

        NOTE(review): the authorise outputs (decision_allow,
        decision_ttl_seconds, decisions_json) are not encoded here —
        presumably the gateway consumes those schema-side rather than
        over HTTP; confirm before exposing authorise via this path.
        """
        out: Dict[str, Any] = {}

        if obj.user is not None:
            out["user"] = _user_record_to_dict(obj.user)
        if obj.users:
            out["users"] = [_user_record_to_dict(u) for u in obj.users]
        if obj.workspace is not None:
            out["workspace"] = _workspace_record_to_dict(obj.workspace)
        if obj.workspaces:
            out["workspaces"] = [
                _workspace_record_to_dict(w) for w in obj.workspaces
            ]

        if obj.api_key_plaintext:
            out["api_key_plaintext"] = obj.api_key_plaintext
        if obj.api_key is not None:
            out["api_key"] = _api_key_record_to_dict(obj.api_key)
        if obj.api_keys:
            out["api_keys"] = [
                _api_key_record_to_dict(k) for k in obj.api_keys
            ]

        # Plain string outputs share one emit-if-set rule.
        for name in (
            "jwt", "jwt_expires", "signing_key_public",
            "resolved_user_id", "resolved_workspace",
        ):
            value = getattr(obj, name)
            if value:
                out[name] = value

        if obj.resolved_roles:
            out["resolved_roles"] = list(obj.resolved_roles)

        for name in (
            "temporary_password", "bootstrap_admin_user_id",
            "bootstrap_admin_api_key",
        ):
            value = getattr(obj, name)
            if value:
                out[name] = value

        # bootstrap-status: emit unconditionally — the false case is
        # meaningful for UIs deciding whether to render first-run
        # setup, so it can't be dropped by a truthy-only filter.
        out["bootstrap_available"] = bool(obj.bootstrap_available)

        return out

    def encode_with_completion(
        self, obj: IamResponse,
    ) -> Tuple[Dict[str, Any], bool]:
        """Encode plus an always-True completion flag (single-shot
        request/response, never streamed)."""
        return self.encode(obj), True
+ password: str = "" + roles: list[str] = field(default_factory=list) + enabled: bool = True + must_change_password: bool = False + + +@dataclass +class UserRecord: + id: str = "" + workspace: str = "" + username: str = "" + name: str = "" + email: str = "" + roles: list[str] = field(default_factory=list) + enabled: bool = True + must_change_password: bool = False + created: str = "" + + +@dataclass +class WorkspaceInput: + id: str = "" + name: str = "" + enabled: bool = True + + +@dataclass +class WorkspaceRecord: + id: str = "" + name: str = "" + enabled: bool = True + created: str = "" + + +@dataclass +class ApiKeyInput: + user_id: str = "" + name: str = "" + expires: str = "" + + +@dataclass +class ApiKeyRecord: + id: str = "" + user_id: str = "" + name: str = "" + # First 4 chars of the plaintext token, for operator identification + # in list-api-keys. Never enough to reconstruct the key. + prefix: str = "" + expires: str = "" + created: str = "" + last_used: str = "" + + +@dataclass +class IamRequest: + operation: str = "" + + # Workspace scope. Required on workspace-scoped operations; + # omitted for system-level ops (workspace CRUD, signing-key + # ops, bootstrap, resolve-api-key, login). + workspace: str = "" + + # Acting user id for audit. Empty for internal-origin and for + # operations that resolve an identity (login, resolve-api-key). + actor: str = "" + + user_id: str = "" + username: str = "" + key_id: str = "" + api_key: str = "" + + password: str = "" + new_password: str = "" + + user: UserInput | None = None + workspace_record: WorkspaceInput | None = None + key: ApiKeyInput | None = None + + # ---- authorise / authorise-many inputs ---- + # Capability string from the vocabulary in capabilities.md. + capability: str = "" + # Resource identifier as JSON. See the IAM contract spec for + # the resource-component vocabulary. An empty dict denotes a + # system-level resource. + resource_json: str = "" + # Operation parameters as JSON. 
Decision-relevant fields the + # operation supplied that are not part of the resource address + # (e.g. workspace association on create-user). + parameters_json: str = "" + # For authorise-many: a JSON-serialised list of + # {"capability": str, "resource": dict, "parameters": dict}. + authorise_checks: str = "" + + +@dataclass +class IamResponse: + user: UserRecord | None = None + users: list[UserRecord] = field(default_factory=list) + + workspace: WorkspaceRecord | None = None + workspaces: list[WorkspaceRecord] = field(default_factory=list) + + # create-api-key returns the plaintext once; never populated + # on any other operation. + api_key_plaintext: str = "" + api_key: ApiKeyRecord | None = None + api_keys: list[ApiKeyRecord] = field(default_factory=list) + + # login, rotate-signing-key + jwt: str = "" + jwt_expires: str = "" + + # get-signing-key-public + signing_key_public: str = "" + + # resolve-api-key + resolved_user_id: str = "" + resolved_workspace: str = "" + resolved_roles: list[str] = field(default_factory=list) + + # reset-password + temporary_password: str = "" + + # bootstrap + bootstrap_admin_user_id: str = "" + bootstrap_admin_api_key: str = "" + + # bootstrap-status — true iff iam-svc is in 'bootstrap' mode with + # empty tables, i.e. an unconsumed bootstrap call would succeed. + bootstrap_available: bool = False + + # ---- authorise / authorise-many outputs ---- + # authorise: the regime's allow / deny verdict. + decision_allow: bool = False + # Cache TTL the regime suggests, in seconds. Gateway respects + # this for both allow and deny decisions; bounded above by + # gateway-side policy (typically <= 60s). + decision_ttl_seconds: int = 0 + # authorise-many: a JSON-serialised list of {"allow": bool, + # "ttl": int} in the same order as the request's + # authorise_checks. 
+ decisions_json: str = "" + + error: Error | None = None + + +iam_request_queue = queue('iam', cls='request') +iam_response_queue = queue('iam', cls='response') + +############################################################################ diff --git a/trustgraph-cli/pyproject.toml b/trustgraph-cli/pyproject.toml index a5738449..e8062fba 100644 --- a/trustgraph-cli/pyproject.toml +++ b/trustgraph-cli/pyproject.toml @@ -40,7 +40,22 @@ tg-get-flow-blueprint = "trustgraph.cli.get_flow_blueprint:main" tg-get-kg-core = "trustgraph.cli.get_kg_core:main" tg-get-document-content = "trustgraph.cli.get_document_content:main" tg-graph-to-turtle = "trustgraph.cli.graph_to_turtle:main" -tg-init-trustgraph = "trustgraph.cli.init_trustgraph:main" +tg-bootstrap-iam = "trustgraph.cli.bootstrap_iam:main" +tg-login = "trustgraph.cli.login:main" +tg-create-user = "trustgraph.cli.create_user:main" +tg-list-users = "trustgraph.cli.list_users:main" +tg-whoami = "trustgraph.cli.whoami:main" +tg-update-user = "trustgraph.cli.update_user:main" +tg-disable-user = "trustgraph.cli.disable_user:main" +tg-enable-user = "trustgraph.cli.enable_user:main" +tg-delete-user = "trustgraph.cli.delete_user:main" +tg-change-password = "trustgraph.cli.change_password:main" +tg-reset-password = "trustgraph.cli.reset_password:main" +tg-create-api-key = "trustgraph.cli.create_api_key:main" +tg-list-api-keys = "trustgraph.cli.list_api_keys:main" +tg-revoke-api-key = "trustgraph.cli.revoke_api_key:main" +tg-list-workspaces = "trustgraph.cli.list_workspaces:main" +tg-create-workspace = "trustgraph.cli.create_workspace:main" tg-invoke-agent = "trustgraph.cli.invoke_agent:main" tg-invoke-document-rag = "trustgraph.cli.invoke_document_rag:main" tg-invoke-graph-rag = "trustgraph.cli.invoke_graph_rag:main" diff --git a/trustgraph-cli/trustgraph/cli/_iam.py b/trustgraph-cli/trustgraph/cli/_iam.py new file mode 100644 index 00000000..f5278c0c --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/_iam.py @@ -0,0 +1,75 @@ 
+""" +Shared helpers for IAM CLI tools. + +All IAM operations go through the gateway's ``/api/v1/iam`` forwarder, +with the three public auth operations (``login``, ``bootstrap``, +``change-password``) served via ``/api/v1/auth/...`` instead. These +helpers encapsulate the HTTP plumbing so each CLI can stay focused +on its own argument parsing and output formatting. +""" + +import json +import os +import sys + +import requests + + +DEFAULT_URL = os.getenv("TRUSTGRAPH_URL", "http://localhost:8088/") +DEFAULT_TOKEN = os.getenv("TRUSTGRAPH_TOKEN", None) + + +def _fmt_error(resp_json): + err = resp_json.get("error", {}) + if isinstance(err, dict): + t = err.get("type", "") + m = err.get("message", "") + return f"{t}: {m}" if t else m or "error" + return str(err) + + +def _post(url, path, token, body): + endpoint = url.rstrip("/") + path + headers = {"Content-Type": "application/json"} + if token: + headers["Authorization"] = f"Bearer {token}" + + resp = requests.post( + endpoint, headers=headers, data=json.dumps(body), + ) + + if resp.status_code != 200: + try: + payload = resp.json() + detail = _fmt_error(payload) + except Exception: + detail = resp.text + raise RuntimeError(f"HTTP {resp.status_code}: {detail}") + + body = resp.json() + if "error" in body: + raise RuntimeError(_fmt_error(body)) + return body + + +def call_iam(url, token, request): + """Forward an IAM request through ``/api/v1/iam``. ``request`` is + the ``IamRequest`` dict shape.""" + return _post(url, "/api/v1/iam", token, request) + + +def call_auth(url, path, token, body): + """Hit one of the public auth endpoints + (``/api/v1/auth/login``, ``/api/v1/auth/change-password``, etc.). 
+ ``token`` is optional — login and bootstrap don't need one.""" + return _post(url, path, token, body) + + +def run_main(fn, parser): + """Standard error-handling wrapper for CLI main() bodies.""" + args = parser.parse_args() + try: + fn(args) + except Exception as e: + print("Exception:", e, file=sys.stderr, flush=True) + sys.exit(1) diff --git a/trustgraph-cli/trustgraph/cli/bootstrap_iam.py b/trustgraph-cli/trustgraph/cli/bootstrap_iam.py new file mode 100644 index 00000000..99a789e2 --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/bootstrap_iam.py @@ -0,0 +1,94 @@ +""" +Bootstraps the IAM service. Only works when iam-svc is running in +bootstrap mode with empty tables. Prints the initial admin API key +to stdout. + +This is a one-time, trust-sensitive operation. The resulting token +is shown once and never again — capture it on use. Rotate and +revoke it as soon as a real admin API key has been issued. +""" + +import argparse +import json +import os +import sys + +import requests + +default_url = os.getenv("TRUSTGRAPH_URL", "http://localhost:8088/") + + +def bootstrap(url): + + # Unauthenticated public endpoint — IAM refuses the bootstrap + # operation unless the service is running in bootstrap mode with + # empty tables, so the safety gate lives on the server side. 
"""
Bootstraps the IAM service. Only works when iam-svc is running in
bootstrap mode with empty tables. Prints the initial admin API key
to stdout.

This is a one-time, trust-sensitive operation. The resulting token
is shown once and never again — capture it on use. Rotate and
revoke it as soon as a real admin API key has been issued.
"""

import argparse
import json
import os
import sys

import requests

default_url = os.getenv("TRUSTGRAPH_URL", "http://localhost:8088/")

# Socket timeout for the bootstrap call, seconds; without one a dead
# gateway would hang the CLI forever.
REQUEST_TIMEOUT = 30


def bootstrap(url):
    """POST /api/v1/auth/bootstrap and return (user_id, api_key).

    Raises RuntimeError on HTTP failure, on an IAM-level error, or
    when the response lacks a bootstrap token.
    """

    # Unauthenticated public endpoint — IAM refuses the bootstrap
    # operation unless the service is running in bootstrap mode with
    # empty tables, so the safety gate lives on the server side.
    endpoint = url.rstrip("/") + "/api/v1/auth/bootstrap"

    headers = {"Content-Type": "application/json"}

    resp = requests.post(
        endpoint,
        headers=headers,
        data=json.dumps({}),
        timeout=REQUEST_TIMEOUT,
    )

    if resp.status_code != 200:
        raise RuntimeError(
            f"HTTP {resp.status_code}: {resp.text}"
        )

    body = resp.json()

    if "error" in body:
        err = body["error"]
        # The error payload is normally {"type", "message"}, but guard
        # against a bare-string error rather than crashing on .get().
        if isinstance(err, dict):
            raise RuntimeError(
                f"IAM {err.get('type', 'error')}: "
                f"{err.get('message', '')}"
            )
        raise RuntimeError(f"IAM error: {err}")

    api_key = body.get("bootstrap_admin_api_key")
    user_id = body.get("bootstrap_admin_user_id")

    if not api_key:
        raise RuntimeError(
            "IAM response did not contain a bootstrap token — the "
            "service may already be bootstrapped, or may be running "
            "in token mode."
        )

    return user_id, api_key


def main():
    """CLI entry point: parse args, bootstrap, print the token."""

    parser = argparse.ArgumentParser(
        prog="tg-bootstrap-iam",
        description=__doc__,
    )

    parser.add_argument(
        "-u", "--api-url",
        default=default_url,
        help=f"API URL (default: {default_url})",
    )

    args = parser.parse_args()

    try:
        user_id, api_key = bootstrap(args.api_url)
    except Exception as e:
        print("Exception:", e, file=sys.stderr, flush=True)
        sys.exit(1)

    # Stdout gets machine-readable output (the key).  Any operator
    # context goes to stderr.
    print(f"Admin user id: {user_id}", file=sys.stderr)
    print(
        "Admin API key (shown once, capture now):",
        file=sys.stderr,
    )
    print(api_key)


if __name__ == "__main__":
    main()
+""" + +import argparse +import getpass + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_auth, run_main + + +def do_change_password(args): + current = args.current or getpass.getpass("Current password: ") + new = args.new or getpass.getpass("New password: ") + + call_auth( + args.api_url, "/api/v1/auth/change-password", args.token, + {"current_password": current, "new_password": new}, + ) + print("Password changed.") + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-change-password", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + parser.add_argument( + "--current", default=None, + help="Current password (prompted if omitted)", + ) + parser.add_argument( + "--new", default=None, + help="New password (prompted if omitted)", + ) + run_main(do_change_password, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/create_api_key.py b/trustgraph-cli/trustgraph/cli/create_api_key.py new file mode 100644 index 00000000..2b269041 --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/create_api_key.py @@ -0,0 +1,71 @@ +""" +Create an API key for a user. Prints the plaintext key to stdout — +shown once only. 
+""" + +import argparse +import sys + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main + + +def do_create_api_key(args): + key = { + "user_id": args.user_id, + "name": args.name, + } + if args.expires: + key["expires"] = args.expires + + req = {"operation": "create-api-key", "key": key} + if args.workspace: + req["workspace"] = args.workspace + resp = call_iam(args.api_url, args.token, req) + + plaintext = resp.get("api_key_plaintext", "") + rec = resp.get("api_key", {}) + print(f"Key id: {rec.get('id', '')}", file=sys.stderr) + print(f"Name: {rec.get('name', '')}", file=sys.stderr) + print(f"Prefix: {rec.get('prefix', '')}", file=sys.stderr) + print( + "API key (shown once, capture now):", file=sys.stderr, + ) + print(plaintext) + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-create-api-key", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + parser.add_argument( + "--user-id", required=True, + help="Owner user id", + ) + parser.add_argument( + "--name", required=True, + help="Operator-facing label (e.g. 'laptop', 'ci')", + ) + parser.add_argument( + "--expires", default=None, + help="ISO-8601 expiry (optional; empty = no expiry)", + ) + parser.add_argument( + "-w", "--workspace", default=None, + help=( + "Target workspace (admin only; defaults to caller's " + "assigned workspace)" + ), + ) + run_main(do_create_api_key, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/create_user.py b/trustgraph-cli/trustgraph/cli/create_user.py new file mode 100644 index 00000000..c9253aca --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/create_user.py @@ -0,0 +1,87 @@ +""" +Create a user in the caller's workspace. Prints the new user id. 
+""" + +import argparse +import getpass +import sys + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main + + +def do_create_user(args): + password = args.password + if not password: + password = getpass.getpass( + f"Password for new user {args.username}: " + ) + + user = { + "username": args.username, + "password": password, + "roles": args.roles, + } + if args.name: + user["name"] = args.name + if args.email: + user["email"] = args.email + if args.must_change_password: + user["must_change_password"] = True + + req = {"operation": "create-user", "user": user} + if args.workspace: + req["workspace"] = args.workspace + resp = call_iam(args.api_url, args.token, req) + + rec = resp.get("user", {}) + print(f"User id: {rec.get('id', '')}", file=sys.stderr) + print(f"Username: {rec.get('username', '')}", file=sys.stderr) + print(f"Roles: {', '.join(rec.get('roles', []))}", file=sys.stderr) + print(rec.get("id", "")) + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-create-user", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + parser.add_argument( + "--username", required=True, help="Username (unique in workspace)", + ) + parser.add_argument( + "--password", default=None, + help="Password (prompted if omitted)", + ) + parser.add_argument( + "--name", default=None, help="Display name", + ) + parser.add_argument( + "--email", default=None, help="Email", + ) + parser.add_argument( + "--roles", nargs="+", default=["reader"], + help="One or more role names (default: reader)", + ) + parser.add_argument( + "--must-change-password", action="store_true", + help="Force password change on next login", + ) + parser.add_argument( + "-w", "--workspace", default=None, + help=( + "Target workspace (admin only; defaults to caller's " + "assigned 
workspace)" + ), + ) + run_main(do_create_user, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/create_workspace.py b/trustgraph-cli/trustgraph/cli/create_workspace.py new file mode 100644 index 00000000..f8367720 --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/create_workspace.py @@ -0,0 +1,46 @@ +""" +Create a workspace (system-level; requires admin). +""" + +import argparse + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main + + +def do_create_workspace(args): + ws = {"id": args.workspace_id, "enabled": True} + if args.name: + ws["name"] = args.name + + resp = call_iam(args.api_url, args.token, { + "operation": "create-workspace", + "workspace_record": ws, + }) + rec = resp.get("workspace", {}) + print(f"Workspace created: {rec.get('id', '')}") + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-create-workspace", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + parser.add_argument( + "--workspace-id", required=True, + help="New workspace id (must not start with '_')", + ) + parser.add_argument( + "--name", default=None, help="Display name", + ) + run_main(do_create_workspace, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/delete_user.py b/trustgraph-cli/trustgraph/cli/delete_user.py new file mode 100644 index 00000000..dbdf7877 --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/delete_user.py @@ -0,0 +1,62 @@ +""" +Delete a user. Removes the user record, their username lookup, +and all their API keys. The freed username becomes available for +re-use. + +Irreversible. Use tg-disable-user if you want to preserve the +record (audit trail, username squatting protection). 
+""" + +import argparse + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main + + +def do_delete_user(args): + if not args.yes: + confirm = input( + f"Delete user {args.user_id}? This is irreversible. " + f"[type 'yes' to confirm]: " + ) + if confirm.strip() != "yes": + print("Aborted.") + return + + req = {"operation": "delete-user", "user_id": args.user_id} + if args.workspace: + req["workspace"] = args.workspace + call_iam(args.api_url, args.token, req) + print(f"Deleted user {args.user_id}") + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-delete-user", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + parser.add_argument( + "--user-id", required=True, help="User id to delete", + ) + parser.add_argument( + "-w", "--workspace", default=None, + help=( + "Target workspace (admin only; defaults to caller's " + "assigned workspace)" + ), + ) + parser.add_argument( + "--yes", action="store_true", + help="Skip the interactive confirmation prompt", + ) + run_main(do_delete_user, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/disable_user.py b/trustgraph-cli/trustgraph/cli/disable_user.py new file mode 100644 index 00000000..e142644b --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/disable_user.py @@ -0,0 +1,45 @@ +""" +Disable a user. Soft-deletes (enabled=false) and revokes all their +API keys. 
+""" + +import argparse + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main + + +def do_disable_user(args): + req = {"operation": "disable-user", "user_id": args.user_id} + if args.workspace: + req["workspace"] = args.workspace + call_iam(args.api_url, args.token, req) + print(f"Disabled user {args.user_id}") + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-disable-user", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + parser.add_argument( + "--user-id", required=True, help="User id to disable", + ) + parser.add_argument( + "-w", "--workspace", default=None, + help=( + "Target workspace (admin only; defaults to caller's " + "assigned workspace)" + ), + ) + run_main(do_disable_user, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/enable_user.py b/trustgraph-cli/trustgraph/cli/enable_user.py new file mode 100644 index 00000000..c762366a --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/enable_user.py @@ -0,0 +1,45 @@ +""" +Re-enable a previously disabled user. Does not restore their API +keys — those must be re-issued by an admin. 
+""" + +import argparse + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main + + +def do_enable_user(args): + req = {"operation": "enable-user", "user_id": args.user_id} + if args.workspace: + req["workspace"] = args.workspace + call_iam(args.api_url, args.token, req) + print(f"Enabled user {args.user_id}") + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-enable-user", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + parser.add_argument( + "--user-id", required=True, help="User id to enable", + ) + parser.add_argument( + "-w", "--workspace", default=None, + help=( + "Target workspace (admin only; defaults to caller's " + "assigned workspace)" + ), + ) + run_main(do_enable_user, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/init_trustgraph.py b/trustgraph-cli/trustgraph/cli/init_trustgraph.py deleted file mode 100644 index d984f925..00000000 --- a/trustgraph-cli/trustgraph/cli/init_trustgraph.py +++ /dev/null @@ -1,271 +0,0 @@ -""" -Initialises TrustGraph pub/sub infrastructure and pushes initial config. - -For Pulsar: creates tenant, namespaces, and retention policies. -For RabbitMQ: queues are auto-declared, so only config push is needed. 
-""" - -import requests -import time -import argparse -import json - -from trustgraph.clients.config_client import ConfigClient -from trustgraph.base.pubsub import add_pubsub_args - -default_pulsar_admin_url = "http://pulsar:8080" -subscriber = "tg-init-pubsub" - - -def get_clusters(url): - - print("Get clusters...", flush=True) - - resp = requests.get(f"{url}/admin/v2/clusters") - - if resp.status_code != 200: raise RuntimeError("Could not fetch clusters") - - return resp.json() - -def ensure_tenant(url, tenant, clusters): - - resp = requests.get(f"{url}/admin/v2/tenants/{tenant}") - - if resp.status_code == 200: - print(f"Tenant {tenant} already exists.", flush=True) - return - - resp = requests.put( - f"{url}/admin/v2/tenants/{tenant}", - json={ - "adminRoles": [], - "allowedClusters": clusters, - } - ) - - if resp.status_code != 204: - print(resp.text, flush=True) - raise RuntimeError("Tenant creation failed.") - - print(f"Tenant {tenant} created.", flush=True) - -def ensure_namespace(url, tenant, namespace, config): - - resp = requests.get(f"{url}/admin/v2/namespaces/{tenant}/{namespace}") - - if resp.status_code == 200: - print(f"Namespace {tenant}/{namespace} already exists.", flush=True) - return - - resp = requests.put( - f"{url}/admin/v2/namespaces/{tenant}/{namespace}", - json=config, - ) - - if resp.status_code != 204: - print(resp.status_code, flush=True) - print(resp.text, flush=True) - raise RuntimeError(f"Namespace {tenant}/{namespace} creation failed.") - - print(f"Namespace {tenant}/{namespace} created.", flush=True) - -def ensure_config(config, workspace="default", **pubsub_config): - - cli = ConfigClient( - subscriber=subscriber, - workspace=workspace, - **pubsub_config, - ) - - while True: - - try: - - print("Get current config...", flush=True) - current, version = cli.config(timeout=5) - - except Exception as e: - - print("Exception:", e, flush=True) - time.sleep(2) - print("Retrying...", flush=True) - continue - - print("Current config 
version is", version, flush=True) - - if version != 0: - print("Already updated, not updating config. Done.", flush=True) - return - - print("Config is version 0, updating...", flush=True) - - batch = [] - - for type in config: - for key in config[type]: - print(f"Adding {type}/{key} to update.", flush=True) - batch.append({ - "type": type, - "key": key, - "value": json.dumps(config[type][key]), - }) - - try: - cli.put(batch, timeout=10) - print("Update succeeded.", flush=True) - break - except Exception as e: - print("Exception:", e, flush=True) - time.sleep(2) - print("Retrying...", flush=True) - continue - -def init_pulsar(pulsar_admin_url, tenant): - """Pulsar-specific setup: create tenant, namespaces, retention policies.""" - - clusters = get_clusters(pulsar_admin_url) - - ensure_tenant(pulsar_admin_url, tenant, clusters) - - ensure_namespace(pulsar_admin_url, tenant, "flow", {}) - - ensure_namespace(pulsar_admin_url, tenant, "request", {}) - - ensure_namespace(pulsar_admin_url, tenant, "response", { - "retention_policies": { - "retentionSizeInMB": -1, - "retentionTimeInMinutes": 3, - "subscriptionExpirationTimeMinutes": 30, - } - }) - - ensure_namespace(pulsar_admin_url, tenant, "notify", { - "retention_policies": { - "retentionSizeInMB": -1, - "retentionTimeInMinutes": 3, - "subscriptionExpirationTimeMinutes": 5, - } - }) - - -def push_config(config_json, config_file, workspace="default", - **pubsub_config): - """Push initial config if provided.""" - - if config_json is not None: - - try: - print("Decoding config...", flush=True) - dec = json.loads(config_json) - print("Decoded.", flush=True) - except Exception as e: - print("Exception:", e, flush=True) - raise e - - ensure_config(dec, workspace=workspace, **pubsub_config) - - elif config_file is not None: - - try: - print("Decoding config...", flush=True) - dec = json.load(open(config_file)) - print("Decoded.", flush=True) - except Exception as e: - print("Exception:", e, flush=True) - raise e - - 
ensure_config(dec, workspace=workspace, **pubsub_config) - - else: - print("No config to update.", flush=True) - - -def main(): - - parser = argparse.ArgumentParser( - prog='tg-init-trustgraph', - description=__doc__, - ) - - parser.add_argument( - '--pulsar-admin-url', - default=default_pulsar_admin_url, - help=f'Pulsar admin URL (default: {default_pulsar_admin_url})', - ) - - parser.add_argument( - '-c', '--config', - help=f'Initial configuration to load', - ) - - parser.add_argument( - '-C', '--config-file', - help=f'Initial configuration to load from file', - ) - - parser.add_argument( - '-t', '--tenant', - default="tg", - help=f'Tenant (default: tg)', - ) - - parser.add_argument( - '-w', '--workspace', - default="default", - help=f'Workspace (default: default)', - ) - - add_pubsub_args(parser) - - args = parser.parse_args() - - backend_type = args.pubsub_backend - - # Extract pubsub config from args - pubsub_config = { - k: v for k, v in vars(args).items() - if k not in ( - 'pulsar_admin_url', 'config', 'config_file', 'tenant', - 'workspace', - ) - } - - while True: - - try: - - # Pulsar-specific setup (tenants, namespaces) - if backend_type == 'pulsar': - print(flush=True) - print( - f"Initialising Pulsar at {args.pulsar_admin_url}...", - flush=True, - ) - init_pulsar(args.pulsar_admin_url, args.tenant) - else: - print(flush=True) - print( - f"Using {backend_type} backend (no admin setup needed).", - flush=True, - ) - - # Push config (works with any backend) - push_config( - args.config, args.config_file, - workspace=args.workspace, - **pubsub_config, - ) - - print("Initialisation complete.", flush=True) - break - - except Exception as e: - - print("Exception:", e, flush=True) - - print("Sleeping...", flush=True) - time.sleep(2) - print("Will retry...", flush=True) - -if __name__ == "__main__": - main() diff --git a/trustgraph-cli/trustgraph/cli/list_api_keys.py b/trustgraph-cli/trustgraph/cli/list_api_keys.py new file mode 100644 index 00000000..f969890e --- 
"""
List the API keys for a user.
"""

import argparse

import tabulate

from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main


def do_list_api_keys(args):
    """Fetch the user's API keys and print them as a table (metadata
    only — plaintext keys are never listable)."""
    request = {"operation": "list-api-keys", "user_id": args.user_id}
    if args.workspace:
        request["workspace"] = args.workspace
    resp = call_iam(args.api_url, args.token, request)

    keys = resp.get("api_keys", [])
    if not keys:
        print("No keys.")
        return

    def row(k):
        return [
            k.get("id", ""),
            k.get("name", ""),
            k.get("prefix", ""),
            k.get("created", ""),
            k.get("last_used", "") or "—",
            k.get("expires", "") or "never",
        ]

    table = tabulate.tabulate(
        [row(k) for k in keys],
        headers=["id", "name", "prefix", "created", "last used", "expires"],
        tablefmt="pretty",
        stralign="left",
    )
    print(table)


def main():
    parser = argparse.ArgumentParser(
        prog="tg-list-api-keys", description=__doc__,
    )
    parser.add_argument(
        "-u", "--api-url", default=DEFAULT_URL,
        help=f"API URL (default: {DEFAULT_URL})",
    )
    parser.add_argument(
        "-t", "--token", default=DEFAULT_TOKEN,
        help="Auth token (default: $TRUSTGRAPH_TOKEN)",
    )
    parser.add_argument(
        "--user-id", required=True,
        help="Owner user id",
    )
    parser.add_argument(
        "-w", "--workspace", default=None,
        help=(
            "Target workspace (admin only; defaults to caller's "
            "assigned workspace)"
        ),
    )
    run_main(do_list_api_keys, parser)


if __name__ == "__main__":
    main()
+""" + +import argparse + +import tabulate + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main + + +def do_list_users(args): + req = {"operation": "list-users"} + if args.workspace: + req["workspace"] = args.workspace + resp = call_iam(args.api_url, args.token, req) + + users = resp.get("users", []) + if not users: + print("No users.") + return + + rows = [ + [ + u.get("id", ""), + u.get("username", ""), + u.get("name", ""), + ", ".join(u.get("roles", [])), + "yes" if u.get("enabled") else "no", + "yes" if u.get("must_change_password") else "no", + ] + for u in users + ] + print(tabulate.tabulate( + rows, + headers=["id", "username", "name", "roles", "enabled", "change-pw"], + tablefmt="pretty", + stralign="left", + )) + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-list-users", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + parser.add_argument( + "-w", "--workspace", default=None, + help=( + "Target workspace (admin only; defaults to caller's " + "assigned workspace)" + ), + ) + run_main(do_list_users, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/list_workspaces.py b/trustgraph-cli/trustgraph/cli/list_workspaces.py new file mode 100644 index 00000000..170d330c --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/list_workspaces.py @@ -0,0 +1,53 @@ +""" +List workspaces (system-level; requires admin). 
+""" + +import argparse + +import tabulate + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main + + +def do_list_workspaces(args): + resp = call_iam( + args.api_url, args.token, {"operation": "list-workspaces"}, + ) + workspaces = resp.get("workspaces", []) + if not workspaces: + print("No workspaces.") + return + rows = [ + [ + w.get("id", ""), + w.get("name", ""), + "yes" if w.get("enabled") else "no", + w.get("created", ""), + ] + for w in workspaces + ] + print(tabulate.tabulate( + rows, + headers=["id", "name", "enabled", "created"], + tablefmt="pretty", + stralign="left", + )) + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-list-workspaces", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + run_main(do_list_workspaces, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/login.py b/trustgraph-cli/trustgraph/cli/login.py new file mode 100644 index 00000000..0e87c3b0 --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/login.py @@ -0,0 +1,62 @@ +""" +Log in with username / password. Prints the resulting JWT to +stdout so it can be captured for subsequent CLI use. +""" + +import argparse +import getpass +import sys + +from ._iam import DEFAULT_URL, call_auth, run_main + + +def do_login(args): + password = args.password + if not password: + password = getpass.getpass(f"Password for {args.username}: ") + + body = { + "username": args.username, + "password": password, + } + if args.workspace: + body["workspace"] = args.workspace + + resp = call_auth(args.api_url, "/api/v1/auth/login", None, body) + + jwt = resp.get("jwt", "") + expires = resp.get("jwt_expires", "") + + if expires: + print(f"JWT expires: {expires}", file=sys.stderr) + # Machine-readable on stdout. 
+ print(jwt) + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-login", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "--username", required=True, help="Username", + ) + parser.add_argument( + "--password", default=None, + help="Password (prompted if omitted)", + ) + parser.add_argument( + "-w", "--workspace", default=None, + help=( + "Optional workspace to log in against. Defaults to " + "the user's assigned workspace." + ), + ) + run_main(do_login, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/reset_password.py b/trustgraph-cli/trustgraph/cli/reset_password.py new file mode 100644 index 00000000..600f00e1 --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/reset_password.py @@ -0,0 +1,54 @@ +""" +Admin: reset another user's password. Prints a one-time temporary +password to stdout. The user is forced to change it on next login. 
+""" + +import argparse +import sys + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main + + +def do_reset_password(args): + req = {"operation": "reset-password", "user_id": args.user_id} + if args.workspace: + req["workspace"] = args.workspace + resp = call_iam(args.api_url, args.token, req) + + tmp = resp.get("temporary_password", "") + if not tmp: + raise RuntimeError( + "IAM returned no temporary password — unexpected" + ) + print("Temporary password (shown once, capture now):", file=sys.stderr) + print(tmp) + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-reset-password", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + parser.add_argument( + "--user-id", required=True, + help="Target user id", + ) + parser.add_argument( + "-w", "--workspace", default=None, + help=( + "Target workspace (admin only; defaults to caller's " + "assigned workspace)" + ), + ) + run_main(do_reset_password, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/revoke_api_key.py b/trustgraph-cli/trustgraph/cli/revoke_api_key.py new file mode 100644 index 00000000..3976b56f --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/revoke_api_key.py @@ -0,0 +1,44 @@ +""" +Revoke an API key by id. 
+""" + +import argparse + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main + + +def do_revoke_api_key(args): + req = {"operation": "revoke-api-key", "key_id": args.key_id} + if args.workspace: + req["workspace"] = args.workspace + call_iam(args.api_url, args.token, req) + print(f"Revoked key {args.key_id}") + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-revoke-api-key", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + parser.add_argument( + "--key-id", required=True, help="Key id to revoke", + ) + parser.add_argument( + "-w", "--workspace", default=None, + help=( + "Target workspace (admin only; defaults to caller's " + "assigned workspace)" + ), + ) + run_main(do_revoke_api_key, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/show_flow_state.py b/trustgraph-cli/trustgraph/cli/show_flow_state.py index 8fec04ec..3a733270 100644 --- a/trustgraph-cli/trustgraph/cli/show_flow_state.py +++ b/trustgraph-cli/trustgraph/cli/show_flow_state.py @@ -44,16 +44,18 @@ def show_processors(metrics_url, flow_label): obj = resp.json() - tbl = [ - [ - m["metric"]["job"], - "\U0001f49a" if int(m["value"][1]) > 0 else "\U0000274c" - ] - for m in obj["data"]["result"] - ] + # consumer_state is one sample per consumer (queue); a processor + # with N subscriptions shows up N times. Aggregate to one row per + # processor: green only if every consumer is running. 
+ by_proc = {} + for m in obj["data"]["result"]: + name = m["metric"].get("processor", m["metric"]["job"]) + running = int(m["value"][1]) > 0 + by_proc[name] = by_proc.get(name, True) and running - for row in tbl: - print(f"- {row[0]:30} {row[1]}") + for name in sorted(by_proc): + icon = "\U0001f49a" if by_proc[name] else "\U0000274c" + print(f"- {name:30} {icon}") def main(): diff --git a/trustgraph-cli/trustgraph/cli/show_processor_state.py b/trustgraph-cli/trustgraph/cli/show_processor_state.py index b4ae4a16..9de05bc6 100644 --- a/trustgraph-cli/trustgraph/cli/show_processor_state.py +++ b/trustgraph-cli/trustgraph/cli/show_processor_state.py @@ -17,7 +17,7 @@ def dump_status(url): tbl = [ [ - m["metric"]["job"], + m["metric"].get("processor", m["metric"]["job"]), "\U0001f49a" ] for m in obj["data"]["result"] diff --git a/trustgraph-cli/trustgraph/cli/update_user.py b/trustgraph-cli/trustgraph/cli/update_user.py new file mode 100644 index 00000000..5c1dc4d7 --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/update_user.py @@ -0,0 +1,125 @@ +""" +Update a user's profile fields: name, email, roles, enabled flag, +must-change-password flag. + +Username is immutable — create a new user and disable the old one +to effect a username change. Password changes go through +``tg-change-password`` (self-service) or ``tg-reset-password`` +(admin-driven). + +Only the fields you supply are changed; omitted fields are left +untouched on the user record. An empty ``--roles`` is rejected by +iam-svc (a user must have at least one role); to demote a user use +``tg-disable-user``. 
+""" + +import argparse +import sys + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main + + +def _parse_bool(s): + if s is None: + return None + s = s.strip().lower() + if s in ("yes", "y", "true", "t", "1"): + return True + if s in ("no", "n", "false", "f", "0"): + return False + raise argparse.ArgumentTypeError( + f"expected yes/no, got {s!r}" + ) + + +def do_update_user(args): + user = {} + if args.name is not None: + user["name"] = args.name + if args.email is not None: + user["email"] = args.email + if args.roles is not None: + user["roles"] = args.roles + if args.enabled is not None: + user["enabled"] = args.enabled + if args.must_change_password is not None: + user["must_change_password"] = args.must_change_password + + if not user: + print( + "tg-update-user: nothing to change — supply at least " + "one of --name / --email / --roles / --enabled / " + "--must-change-password", + file=sys.stderr, + ) + sys.exit(2) + + req = { + "operation": "update-user", + "user_id": args.user_id, + "user": user, + } + if args.workspace: + req["workspace"] = args.workspace + resp = call_iam(args.api_url, args.token, req) + + rec = resp.get("user", {}) + print(f"id : {rec.get('id', '')}") + print(f"username : {rec.get('username', '')}") + print(f"name : {rec.get('name', '')}") + print(f"email : {rec.get('email', '')}") + print(f"workspace : {rec.get('workspace', '')}") + print(f"roles : {', '.join(rec.get('roles', []))}") + print(f"enabled : {'yes' if rec.get('enabled') else 'no'}") + print( + f"must-change-pw: " + f"{'yes' if rec.get('must_change_password') else 'no'}" + ) + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-update-user", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + parser.add_argument( + "--user-id", required=True, 
help="Target user id", + ) + parser.add_argument( + "--name", default=None, help="New display name", + ) + parser.add_argument( + "--email", default=None, help="New email", + ) + parser.add_argument( + "--roles", nargs="+", default=None, + help="Replacement role list (e.g. --roles reader writer)", + ) + parser.add_argument( + "--enabled", type=_parse_bool, default=None, + help="Set enabled flag (yes/no)", + ) + parser.add_argument( + "--must-change-password", type=_parse_bool, default=None, + help="Set must-change-password flag (yes/no)", + ) + parser.add_argument( + "-w", "--workspace", default=None, + help=( + "Optional workspace integrity check — when supplied, " + "iam-svc verifies the target user's home workspace " + "matches" + ), + ) + run_main(do_update_user, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-cli/trustgraph/cli/whoami.py b/trustgraph-cli/trustgraph/cli/whoami.py new file mode 100644 index 00000000..1799685d --- /dev/null +++ b/trustgraph-cli/trustgraph/cli/whoami.py @@ -0,0 +1,52 @@ +""" +Show the authenticated caller's own user record. 
+""" + +import argparse + +import tabulate + +from ._iam import DEFAULT_URL, DEFAULT_TOKEN, call_iam, run_main + + +def do_whoami(args): + resp = call_iam(args.api_url, args.token, {"operation": "whoami"}) + user = resp.get("user") + if not user: + print("(no user record returned)") + return + + rows = [ + ["id", user.get("id", "")], + ["username", user.get("username", "")], + ["name", user.get("name", "")], + ["email", user.get("email", "")], + ["workspace", user.get("workspace", "")], + ["roles", ", ".join(user.get("roles", []))], + ["enabled", "yes" if user.get("enabled") else "no"], + [ + "must change password", + "yes" if user.get("must_change_password") else "no", + ], + ["created", user.get("created", "")], + ] + print(tabulate.tabulate(rows, tablefmt="plain")) + + +def main(): + parser = argparse.ArgumentParser( + prog="tg-whoami", description=__doc__, + ) + parser.add_argument( + "-u", "--api-url", default=DEFAULT_URL, + help=f"API URL (default: {DEFAULT_URL})", + ) + parser.add_argument( + "-t", "--token", default=DEFAULT_TOKEN, + help="Auth token (default: $TRUSTGRAPH_TOKEN)", + ) + run_main(do_whoami, parser) + + +if __name__ == "__main__": + main() diff --git a/trustgraph-flow/pyproject.toml b/trustgraph-flow/pyproject.toml index 8ba85adf..d8c690b5 100644 --- a/trustgraph-flow/pyproject.toml +++ b/trustgraph-flow/pyproject.toml @@ -60,8 +60,10 @@ agent-orchestrator = "trustgraph.agent.orchestrator:run" api-gateway = "trustgraph.gateway:run" chunker-recursive = "trustgraph.chunking.recursive:run" chunker-token = "trustgraph.chunking.token:run" +bootstrap = "trustgraph.bootstrap.bootstrapper:run" config-svc = "trustgraph.config.service:run" flow-svc = "trustgraph.flow.service:run" +iam-svc = "trustgraph.iam.service:run" doc-embeddings-query-milvus = "trustgraph.query.doc_embeddings.milvus:run" doc-embeddings-query-pinecone = "trustgraph.query.doc_embeddings.pinecone:run" doc-embeddings-query-qdrant = "trustgraph.query.doc_embeddings.qdrant:run" diff 
--git a/trustgraph-flow/trustgraph/bootstrap/__init__.py b/trustgraph-flow/trustgraph/bootstrap/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/trustgraph-flow/trustgraph/bootstrap/base.py b/trustgraph-flow/trustgraph/bootstrap/base.py new file mode 100644 index 00000000..cb022a16 --- /dev/null +++ b/trustgraph-flow/trustgraph/bootstrap/base.py @@ -0,0 +1,68 @@ +""" +Bootstrap framework: Initialiser base class and per-wake context. + +See docs/tech-specs/bootstrap.md for the full design. +""" + +import logging +from dataclasses import dataclass +from typing import Any + + +@dataclass +class InitContext: + """Shared per-wake context passed to each initialiser. + + The bootstrapper constructs one of these on every wake cycle, + tears it down at cycle end, and passes it into each initialiser's + ``run()`` method. Fields are short-lived and safe to use during + a single cycle only. + """ + + logger: logging.Logger + config: Any # ConfigClient + flow: Any # RequestResponse client for flow-svc + + +class Initialiser: + """Base class for bootstrap initialisers. + + Subclasses implement :meth:`run`. The bootstrapper manages + completion state, flag comparison, retry and error handling — + subclasses describe only the work to perform. + + Class attributes: + + * ``wait_for_services`` (bool, default ``True``): when ``True`` the + initialiser only runs after the bootstrapper's service gate has + passed (config-svc and flow-svc reachable). Set ``False`` for + initialisers that bring up infrastructure the gate itself + depends on — principally Pulsar topology, without which + config-svc cannot come online. + """ + + wait_for_services: bool = True + + def __init__(self, **params): + # Subclasses should consume their own params via keyword + # arguments in their own __init__ signatures. This catch-all + # is here so any kwargs that filter through unnoticed don't + # raise TypeError on construction. 
+ pass + + async def run(self, ctx, old_flag, new_flag): + """Perform initialisation work. + + :param ctx: :class:`InitContext` with logger, config client, + flow-svc client. + :param old_flag: Previously-stored flag string, or ``None`` if + this initialiser has never successfully completed in this + deployment. + :param new_flag: Currently-configured flag. A string chosen + by the operator; typically something like ``"v1"``. + + :raises: Any exception on failure. The bootstrapper catches, + logs, and re-runs on the next cycle; completion state is + only written on clean return. + """ + raise NotImplementedError diff --git a/trustgraph-flow/trustgraph/bootstrap/bootstrapper/__init__.py b/trustgraph-flow/trustgraph/bootstrap/bootstrapper/__init__.py new file mode 100644 index 00000000..98f4d9da --- /dev/null +++ b/trustgraph-flow/trustgraph/bootstrap/bootstrapper/__init__.py @@ -0,0 +1 @@ +from . service import * diff --git a/trustgraph-flow/trustgraph/bootstrap/bootstrapper/__main__.py b/trustgraph-flow/trustgraph/bootstrap/bootstrapper/__main__.py new file mode 100644 index 00000000..da5a9021 --- /dev/null +++ b/trustgraph-flow/trustgraph/bootstrap/bootstrapper/__main__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from . service import run + +if __name__ == '__main__': + run() diff --git a/trustgraph-flow/trustgraph/bootstrap/bootstrapper/service.py b/trustgraph-flow/trustgraph/bootstrap/bootstrapper/service.py new file mode 100644 index 00000000..eb6238d3 --- /dev/null +++ b/trustgraph-flow/trustgraph/bootstrap/bootstrapper/service.py @@ -0,0 +1,414 @@ +""" +Bootstrapper processor. + +Runs a pluggable list of initialisers in a reconciliation loop. +Each initialiser's completion state is recorded in the reserved +``__system__`` workspace under the ``init-state`` config type. + +See docs/tech-specs/bootstrap.md for the full design. 
+""" + +import asyncio +import importlib +import json +import logging +import uuid +from argparse import ArgumentParser +from dataclasses import dataclass + +from trustgraph.base import AsyncProcessor +from trustgraph.base import ProducerMetrics, SubscriberMetrics +from trustgraph.base.config_client import ConfigClient +from trustgraph.base.request_response_spec import RequestResponse +from trustgraph.schema import ( + ConfigRequest, ConfigResponse, + config_request_queue, config_response_queue, +) +from trustgraph.schema import ( + FlowRequest, FlowResponse, + flow_request_queue, flow_response_queue, +) + +from .. base import Initialiser, InitContext + +logger = logging.getLogger(__name__) + +default_ident = "bootstrap" + +# Reserved workspace + config type under which completion state is +# stored. Reserved (`_`-prefix) workspaces are excluded from the +# config push broadcast — live processors never see these keys. +SYSTEM_WORKSPACE = "__system__" +INIT_STATE_TYPE = "init-state" + +# Cadence tiers. +GATE_BACKOFF = 5 # Services not responding; retry soon. +INIT_RETRY = 15 # Gate passed but something ran/failed; + # converge quickly. +STEADY_INTERVAL = 300 # Everything at target flag; idle cheaply. + + +@dataclass +class InitialiserSpec: + """One entry in the bootstrapper's configured list of initialisers.""" + name: str + flag: str + instance: Initialiser + + +def _resolve_class(dotted): + """Import and return a class by its dotted path.""" + module_path, _, class_name = dotted.rpartition(".") + if not module_path: + raise ValueError( + f"Initialiser class must be a dotted path, got {dotted!r}" + ) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def _load_initialisers_file(path): + """Load the initialisers spec list from a YAML or JSON file. + + File shape: + + .. 
code-block:: yaml + + initialisers: + - class: trustgraph.bootstrap.initialisers.PulsarTopology + name: pulsar-topology + flag: v1 + params: + admin_url: http://pulsar:8080 + tenant: tg + - ... + """ + with open(path) as f: + content = f.read() + if path.endswith((".yaml", ".yml")): + import yaml + doc = yaml.safe_load(content) + else: + doc = json.loads(content) + if not isinstance(doc, dict) or "initialisers" not in doc: + raise RuntimeError( + f"{path}: expected a mapping with an 'initialisers' key" + ) + return doc["initialisers"] + + +class Processor(AsyncProcessor): + + def __init__(self, **params): + + super().__init__(**params) + + # Source the initialisers list either from a direct parameter + # (processor-group embedding) or from a file (CLI launch). + inits = params.get("initialisers") + if inits is None: + inits_file = params.get("initialisers_file") + if inits_file is None: + raise RuntimeError( + "Bootstrapper requires either the 'initialisers' " + "parameter or --initialisers-file" + ) + inits = _load_initialisers_file(inits_file) + + self.specs = [] + names = set() + + for entry in inits: + if not isinstance(entry, dict): + raise RuntimeError( + f"Initialiser entry must be a mapping, got: {entry!r}" + ) + for required in ("class", "name", "flag"): + if required not in entry: + raise RuntimeError( + f"Initialiser entry missing required field " + f"{required!r}: {entry!r}" + ) + + name = entry["name"] + if name in names: + raise RuntimeError(f"Duplicate initialiser name {name!r}") + names.add(name) + + cls = _resolve_class(entry["class"]) + + try: + instance = cls(**entry.get("params", {})) + except Exception as e: + raise RuntimeError( + f"Failed to instantiate initialiser " + f"{entry['class']!r} as {name!r}: " + f"{type(e).__name__}: {e}" + ) + + self.specs.append(InitialiserSpec( + name=name, + flag=entry["flag"], + instance=instance, + )) + + logger.info( + f"Bootstrapper: loaded {len(self.specs)} initialisers" + ) + + # 
------------------------------------------------------------------ + # Client construction (short-lived per wake cycle). + # ------------------------------------------------------------------ + + def _make_config_client(self): + rr_id = str(uuid.uuid4()) + return ConfigClient( + backend=self.pubsub_backend, + subscription=f"{self.id}--config--{rr_id}", + consumer_name=self.id, + request_topic=config_request_queue, + request_schema=ConfigRequest, + request_metrics=ProducerMetrics( + processor=self.id, flow=None, name="config-request", + ), + response_topic=config_response_queue, + response_schema=ConfigResponse, + response_metrics=SubscriberMetrics( + processor=self.id, flow=None, name="config-response", + ), + ) + + def _make_flow_client(self): + rr_id = str(uuid.uuid4()) + return RequestResponse( + backend=self.pubsub_backend, + subscription=f"{self.id}--flow--{rr_id}", + consumer_name=self.id, + request_topic=flow_request_queue, + request_schema=FlowRequest, + request_metrics=ProducerMetrics( + processor=self.id, flow=None, name="flow-request", + ), + response_topic=flow_response_queue, + response_schema=FlowResponse, + response_metrics=SubscriberMetrics( + processor=self.id, flow=None, name="flow-response", + ), + ) + + async def _open_clients(self): + config = self._make_config_client() + flow = self._make_flow_client() + await config.start() + try: + await flow.start() + except Exception: + await self._safe_stop(config) + raise + return config, flow + + async def _safe_stop(self, client): + try: + await client.stop() + except Exception: + pass + + # ------------------------------------------------------------------ + # Service gate. 
+ # ------------------------------------------------------------------ + + async def _gate_ready(self, config, flow): + try: + await config.keys(SYSTEM_WORKSPACE, INIT_STATE_TYPE) + except Exception as e: + logger.info( + f"Gate: config-svc not ready ({type(e).__name__}: {e})" + ) + return False + + try: + resp = await flow.request( + FlowRequest( + operation="list-blueprints", + workspace=SYSTEM_WORKSPACE, + ), + timeout=5, + ) + if resp.error: + logger.info( + f"Gate: flow-svc error: " + f"{resp.error.type}: {resp.error.message}" + ) + return False + except Exception as e: + logger.info( + f"Gate: flow-svc not ready ({type(e).__name__}: {e})" + ) + return False + + return True + + # ------------------------------------------------------------------ + # Completion state. + # ------------------------------------------------------------------ + + async def _stored_flag(self, config, name): + raw = await config.get(SYSTEM_WORKSPACE, INIT_STATE_TYPE, name) + if raw is None: + return None + try: + return json.loads(raw) + except Exception: + return raw + + async def _store_flag(self, config, name, flag): + await config.put( + SYSTEM_WORKSPACE, INIT_STATE_TYPE, name, + json.dumps(flag), + ) + + # ------------------------------------------------------------------ + # Per-spec execution. + # ------------------------------------------------------------------ + + async def _run_spec(self, spec, config, flow): + """Run a single initialiser spec. + + Returns one of: + - ``"skip"``: stored flag already matches target, nothing to do. + - ``"ran"``: initialiser ran and completion state was updated. + - ``"failed"``: initialiser raised. + - ``"failed-state-write"``: initialiser succeeded but we could + not persist the new flag (transient — will re-run next cycle). 
+ """ + + try: + old_flag = await self._stored_flag(config, spec.name) + except Exception as e: + logger.warning( + f"{spec.name}: could not read stored flag " + f"({type(e).__name__}: {e})" + ) + return "failed" + + if old_flag == spec.flag: + return "skip" + + child_logger = logger.getChild(spec.name) + child_ctx = InitContext( + logger=child_logger, + config=config, + flow=flow, + ) + + child_logger.info( + f"Running (old_flag={old_flag!r} -> new_flag={spec.flag!r})" + ) + + try: + await spec.instance.run(child_ctx, old_flag, spec.flag) + except Exception as e: + child_logger.error( + f"Failed: {type(e).__name__}: {e}", exc_info=True, + ) + return "failed" + + try: + await self._store_flag(config, spec.name, spec.flag) + except Exception as e: + child_logger.warning( + f"Completed but could not persist state flag " + f"({type(e).__name__}: {e}); will re-run next cycle" + ) + return "failed-state-write" + + child_logger.info(f"Completed (flag={spec.flag!r})") + return "ran" + + # ------------------------------------------------------------------ + # Main loop. + # ------------------------------------------------------------------ + + async def run(self): + + logger.info( + f"Bootstrapper starting with {len(self.specs)} initialisers" + ) + + while self.running: + + sleep_for = STEADY_INTERVAL + + try: + config, flow = await self._open_clients() + except Exception as e: + logger.info( + f"Failed to open clients " + f"({type(e).__name__}: {e}); retry in {GATE_BACKOFF}s" + ) + await asyncio.sleep(GATE_BACKOFF) + continue + + try: + # Phase 1: pre-service initialisers run unconditionally. + pre_specs = [ + s for s in self.specs + if not s.instance.wait_for_services + ] + pre_results = {} + for spec in pre_specs: + pre_results[spec.name] = await self._run_spec( + spec, config, flow, + ) + + # Phase 2: gate. + gate_ok = await self._gate_ready(config, flow) + + # Phase 3: post-service initialisers, if gate passed. 
+ post_results = {} + if gate_ok: + post_specs = [ + s for s in self.specs + if s.instance.wait_for_services + ] + for spec in post_specs: + post_results[spec.name] = await self._run_spec( + spec, config, flow, + ) + + # Cadence selection. + if not gate_ok: + sleep_for = GATE_BACKOFF + else: + all_results = {**pre_results, **post_results} + if any(r != "skip" for r in all_results.values()): + sleep_for = INIT_RETRY + else: + sleep_for = STEADY_INTERVAL + + finally: + await self._safe_stop(config) + await self._safe_stop(flow) + + await asyncio.sleep(sleep_for) + + # ------------------------------------------------------------------ + # CLI arg plumbing. + # ------------------------------------------------------------------ + + @staticmethod + def add_args(parser: ArgumentParser) -> None: + + AsyncProcessor.add_args(parser) + + parser.add_argument( + '-c', '--initialisers-file', + help='Path to YAML or JSON file describing the ' + 'initialisers to run. Ignored when the ' + "'initialisers' parameter is provided directly " + '(e.g. when running inside a processor group).', + ) + + +def run(): + Processor.launch(default_ident, __doc__) diff --git a/trustgraph-flow/trustgraph/bootstrap/initialisers/__init__.py b/trustgraph-flow/trustgraph/bootstrap/initialisers/__init__.py new file mode 100644 index 00000000..6171eb02 --- /dev/null +++ b/trustgraph-flow/trustgraph/bootstrap/initialisers/__init__.py @@ -0,0 +1,20 @@ +""" +Core bootstrap initialisers. + +These cover the base TrustGraph deployment case. Enterprise or +third-party initialisers live in their own packages and are +referenced in the bootstrapper's config by fully-qualified dotted +path. +""" + +from . pulsar_topology import PulsarTopology +from . template_seed import TemplateSeed +from . workspace_init import WorkspaceInit +from . 
default_flow_start import DefaultFlowStart + +__all__ = [ + "PulsarTopology", + "TemplateSeed", + "WorkspaceInit", + "DefaultFlowStart", +] diff --git a/trustgraph-flow/trustgraph/bootstrap/initialisers/default_flow_start.py b/trustgraph-flow/trustgraph/bootstrap/initialisers/default_flow_start.py new file mode 100644 index 00000000..7e7f96bd --- /dev/null +++ b/trustgraph-flow/trustgraph/bootstrap/initialisers/default_flow_start.py @@ -0,0 +1,101 @@ +""" +DefaultFlowStart initialiser — starts a named flow in a workspace +using a specified blueprint. + +Separated from WorkspaceInit so deployments that want a workspace +without an auto-started flow can simply omit this initialiser. + +Parameters +---------- +workspace : str (default "default") + Workspace in which to start the flow. +flow_id : str (default "default") + Identifier for the started flow. +blueprint : str (required) + Blueprint name (must already exist in the workspace's config, + typically via TemplateSeed -> WorkspaceInit). +description : str (default "Default") + Human-readable description passed to flow-svc. +parameters : dict (optional) + Optional parameter overrides passed to start-flow. +""" + +from trustgraph.schema import FlowRequest + +from .. base import Initialiser + + +class DefaultFlowStart(Initialiser): + + def __init__( + self, + workspace="default", + flow_id="default", + blueprint=None, + description="Default", + parameters=None, + **kwargs, + ): + super().__init__(**kwargs) + if not blueprint: + raise ValueError( + "DefaultFlowStart requires 'blueprint'" + ) + self.workspace = workspace + self.flow_id = flow_id + self.blueprint = blueprint + self.description = description + self.parameters = dict(parameters) if parameters else {} + + async def run(self, ctx, old_flag, new_flag): + + # Check whether the flow already exists. 
Belt-and-braces + # beyond the flag gate: if an operator stops and restarts the + # bootstrapper after the flow is already running, we don't + # want to blindly try to start it again. + list_resp = await ctx.flow.request( + FlowRequest( + operation="list-flows", + workspace=self.workspace, + ), + timeout=10, + ) + if list_resp.error: + raise RuntimeError( + f"list-flows failed: " + f"{list_resp.error.type}: {list_resp.error.message}" + ) + + if self.flow_id in (list_resp.flow_ids or []): + ctx.logger.info( + f"Flow {self.flow_id!r} already running in workspace " + f"{self.workspace!r}; nothing to do" + ) + return + + ctx.logger.info( + f"Starting flow {self.flow_id!r} " + f"(blueprint={self.blueprint!r}) " + f"in workspace {self.workspace!r}" + ) + + resp = await ctx.flow.request( + FlowRequest( + operation="start-flow", + workspace=self.workspace, + flow_id=self.flow_id, + blueprint_name=self.blueprint, + description=self.description, + parameters=self.parameters, + ), + timeout=30, + ) + if resp.error: + raise RuntimeError( + f"start-flow failed: " + f"{resp.error.type}: {resp.error.message}" + ) + + ctx.logger.info( + f"Flow {self.flow_id!r} started" + ) diff --git a/trustgraph-flow/trustgraph/bootstrap/initialisers/pulsar_topology.py b/trustgraph-flow/trustgraph/bootstrap/initialisers/pulsar_topology.py new file mode 100644 index 00000000..843fe056 --- /dev/null +++ b/trustgraph-flow/trustgraph/bootstrap/initialisers/pulsar_topology.py @@ -0,0 +1,131 @@ +""" +PulsarTopology initialiser — creates Pulsar tenant and namespaces +with their retention policies. + +Runs pre-gate (``wait_for_services = False``) because config-svc and +flow-svc can't connect to Pulsar until these namespaces exist. +Admin-API calls are idempotent so re-runs on flag change are safe. +""" + +import asyncio +import requests + +from .. base import Initialiser + +# Namespace configs. flow/request take broker defaults. 
# Namespace configs. flow/request take broker defaults. response
# and notify get aggressive retention — those classes carry short-lived
# request/response and notification traffic only.
# (retentionSizeInMB -1 = no size bound; the time bound governs.)
NAMESPACE_CONFIG = {
    "flow": {},
    "request": {},
    "response": {
        "retention_policies": {
            "retentionSizeInMB": -1,
            "retentionTimeInMinutes": 3,
            "subscriptionExpirationTimeMinutes": 30,
        },
    },
    "notify": {
        "retention_policies": {
            "retentionSizeInMB": -1,
            "retentionTimeInMinutes": 3,
            "subscriptionExpirationTimeMinutes": 5,
        },
    },
}

# Timeout (seconds) for each individual admin-API HTTP call.
REQUEST_TIMEOUT = 10


class PulsarTopology(Initialiser):
    """Creates the Pulsar tenant and namespaces with their retention
    policies.

    Runs pre-gate (``wait_for_services = False``) because config-svc
    and flow-svc can't connect to Pulsar until these namespaces
    exist. Admin-API calls are idempotent so re-runs are safe.
    """

    wait_for_services = False

    def __init__(
        self,
        admin_url="http://pulsar:8080",
        tenant="tg",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.admin_url = admin_url.rstrip("/")
        self.tenant = tenant

    async def run(self, ctx, old_flag, new_flag):
        # requests is blocking; offload to executor so the loop stays
        # responsive. get_running_loop() rather than the deprecated
        # get_event_loop() — run() is always awaited from a coroutine.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, self._reconcile_sync, ctx.logger)

    # ------------------------------------------------------------------
    # Sync admin-API calls.
    # ------------------------------------------------------------------

    def _get_clusters(self):
        """List broker clusters — needed as allowedClusters on the
        tenant we create."""
        resp = requests.get(
            f"{self.admin_url}/admin/v2/clusters",
            timeout=REQUEST_TIMEOUT,
        )
        resp.raise_for_status()
        return resp.json()

    def _tenant_exists(self):
        # NOTE(review): any non-200 (including a transient 500) reads
        # as "doesn't exist" and triggers a create attempt; the create
        # then fails loudly, so errors aren't silently lost.
        resp = requests.get(
            f"{self.admin_url}/admin/v2/tenants/{self.tenant}",
            timeout=REQUEST_TIMEOUT,
        )
        return resp.status_code == 200

    def _create_tenant(self, clusters):
        resp = requests.put(
            f"{self.admin_url}/admin/v2/tenants/{self.tenant}",
            json={"adminRoles": [], "allowedClusters": clusters},
            timeout=REQUEST_TIMEOUT,
        )
        # Pulsar returns 204 on successful create.
        if resp.status_code != 204:
            raise RuntimeError(
                f"Tenant {self.tenant!r} create failed: "
                f"{resp.status_code} {resp.text}"
            )

    def _namespace_exists(self, namespace):
        resp = requests.get(
            f"{self.admin_url}/admin/v2/namespaces/"
            f"{self.tenant}/{namespace}",
            timeout=REQUEST_TIMEOUT,
        )
        return resp.status_code == 200

    def _create_namespace(self, namespace, config):
        resp = requests.put(
            f"{self.admin_url}/admin/v2/namespaces/"
            f"{self.tenant}/{namespace}",
            json=config,
            timeout=REQUEST_TIMEOUT,
        )
        if resp.status_code != 204:
            raise RuntimeError(
                f"Namespace {self.tenant}/{namespace} create failed: "
                f"{resp.status_code} {resp.text}"
            )

    def _reconcile_sync(self, logger):
        """Create tenant then any missing namespaces (check-then-create,
        so re-runs are no-ops)."""
        if not self._tenant_exists():
            clusters = self._get_clusters()
            logger.info(
                f"Creating tenant {self.tenant!r} with clusters {clusters}"
            )
            self._create_tenant(clusters)
        else:
            logger.debug(f"Tenant {self.tenant!r} already exists")

        for namespace, config in NAMESPACE_CONFIG.items():
            if self._namespace_exists(namespace):
                logger.debug(
                    f"Namespace {self.tenant}/{namespace} already exists"
                )
                continue
            logger.info(
                f"Creating namespace {self.tenant}/{namespace}"
            )
            self._create_namespace(namespace, config)
TEMPLATE_WORKSPACE = "__template__"


class TemplateSeed(Initialiser):
    """Populate the reserved ``__template__`` workspace from an
    external JSON seed file.

    Seed shape: top-level keys are config types, nested keys are
    config entries; values are arbitrary JSON and are serialised
    with ``json.dumps()`` on write.

    On a clean first run every entry is written. On re-run (flag
    change) behaviour depends on ``overwrite``: True rewrites all
    keys, False writes only keys not already present (preserving
    operator customisation of the template).
    """

    def __init__(self, config_file, overwrite=False, **kwargs):
        super().__init__(**kwargs)
        if not config_file:
            raise ValueError("TemplateSeed requires 'config_file'")
        self.config_file = config_file
        self.overwrite = overwrite

    async def run(self, ctx, old_flag, new_flag):

        with open(self.config_file) as f:
            seed = json.load(f)

        # First run (no prior flag) and overwrite re-runs both take
        # the write-everything path; otherwise upsert-missing only.
        if old_flag is None or self.overwrite:
            await self._write_all(ctx, seed)
        else:
            await self._upsert_missing(ctx, seed)

    async def _write_all(self, ctx, seed):
        entries = [
            (type_name, key, json.dumps(value))
            for type_name, per_type in seed.items()
            for key, value in per_type.items()
        ]
        if entries:
            await ctx.config.put_many(TEMPLATE_WORKSPACE, entries)
        ctx.logger.info(
            f"Template seeded with {len(entries)} entries"
        )

    async def _upsert_missing(self, ctx, seed):
        written = 0
        for type_name, per_type in seed.items():
            present = set(
                await ctx.config.keys(TEMPLATE_WORKSPACE, type_name)
            )
            batch = [
                (type_name, key, json.dumps(value))
                for key, value in per_type.items()
                if key not in present
            ]
            if batch:
                await ctx.config.put_many(TEMPLATE_WORKSPACE, batch)
                written += len(batch)
        ctx.logger.info(
            f"Template upsert-missing: {written} new entries"
        )
TEMPLATE_WORKSPACE = "__template__"


class WorkspaceInit(Initialiser):
    """Creates a workspace and populates it from either the
    ``__template__`` workspace (source="template") or a JSON seed
    file on disk (source="seed-file").

    When source is "template", ``run`` raises RuntimeError if the
    template workspace is empty — indicating TemplateSeed hasn't run
    yet; the bootstrapper's retry loop re-attempts on the next cycle.
    """

    def __init__(
        self,
        workspace="default",
        source="template",
        seed_file=None,
        overwrite=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Validate eagerly — bad wiring should fail at construction,
        # not on the first run() cycle.
        if source not in ("template", "seed-file"):
            raise ValueError(
                f"WorkspaceInit: source must be 'template' or "
                f"'seed-file', got {source!r}"
            )
        if source == "seed-file" and not seed_file:
            raise ValueError(
                "WorkspaceInit: seed_file required when source='seed-file'"
            )

        self.workspace = workspace
        self.source = source
        self.seed_file = seed_file
        self.overwrite = overwrite

    async def run(self, ctx, old_flag, new_flag):
        # Load the seed tree first; _load_from_template raises when
        # the template is empty, aborting this cycle.
        if self.source == "seed-file":
            tree = self._load_seed_file()
        else:
            tree = await self._load_from_template(ctx)

        # First run or explicit overwrite writes everything;
        # otherwise preserve in-workspace customisation.
        if old_flag is None or self.overwrite:
            await self._write_all(ctx, tree)
        else:
            await self._upsert_missing(ctx, tree)

    def _load_seed_file(self):
        # Seed file shape matches TemplateSeed: {type: {key: value}}.
        with open(self.seed_file) as f:
            return json.load(f)

    async def _load_from_template(self, ctx):
        """Build a seed tree from the entire ``__template__`` workspace.
        Raises if the workspace is empty, so the bootstrapper knows
        the prerequisite isn't met yet."""

        raw_tree = await ctx.config.get_all(TEMPLATE_WORKSPACE)

        tree = {}
        total = 0
        for type_name, entries in raw_tree.items():
            parsed = {}
            for key, raw in entries.items():
                if raw is None:
                    continue
                try:
                    parsed[key] = json.loads(raw)
                except Exception:
                    # Stored value isn't JSON — keep the raw string.
                    parsed[key] = raw
                total += 1
            if parsed:
                tree[type_name] = parsed

        if total == 0:
            raise RuntimeError(
                "Template workspace is empty — has TemplateSeed run yet?"
            )

        ctx.logger.debug(
            f"Loaded {total} template entries across {len(tree)} types"
        )
        return tree

    async def _write_all(self, ctx, tree):
        # Flatten to (type, key, json-string) triples; one batch write.
        values = []
        for type_name, entries in tree.items():
            for key, value in entries.items():
                values.append((type_name, key, json.dumps(value)))
        if values:
            await ctx.config.put_many(self.workspace, values)
        ctx.logger.info(
            f"Workspace {self.workspace!r} populated with "
            f"{len(values)} entries"
        )

    async def _upsert_missing(self, ctx, tree):
        # Per config type: write only keys not already present.
        written = 0
        for type_name, entries in tree.items():
            existing = set(
                await ctx.config.keys(self.workspace, type_name)
            )
            values = []
            for key, value in entries.items():
                if key not in existing:
                    values.append(
                        (type_name, key, json.dumps(value))
                    )
            if values:
                await ctx.config.put_many(self.workspace, values)
                written += len(values)
        ctx.logger.info(
            f"Workspace {self.workspace!r} upsert-missing: "
            f"{written} new entries"
        )
def is_reserved_workspace(workspace):
    """Reserved workspaces are storage-only.

    Any workspace id beginning with ``_`` is reserved for internal use
    (e.g. ``__template__`` holding factory-default seed config).
    Reads and writes work normally so bootstrap and provisioning code
    can use the standard config API, but change notifications for
    reserved workspaces are suppressed. Services subscribed to the
    config push therefore never see reserved-workspace events and
    cannot accidentally act on template content as if it were live
    state.
    """
    # Slice comparison is equivalent to startswith("_") and is safe
    # for the empty string (yields False).
    return workspace[:1] == "_"
base import EmbeddingsService -from ollama import Client +from ollama import AsyncClient import os import logging @@ -30,24 +30,24 @@ class Processor(EmbeddingsService): } ) - self.client = Client(host=ollama) + self.client = AsyncClient(host=ollama) self.default_model = model self._checked_models = set() - def _ensure_model(self, model_name): + async def _ensure_model(self, model_name): """Check if model exists locally, pull it if not.""" if model_name in self._checked_models: return try: - self.client.show(model_name) + await self.client.show(model_name) self._checked_models.add(model_name) except Exception as e: status_code = getattr(e, 'status_code', None) if status_code == 404 or "not found" in str(e).lower(): logger.info(f"Ollama model '{model_name}' not found locally. Pulling, this may take a while...") try: - self.client.pull(model_name) + await self.client.pull(model_name) self._checked_models.add(model_name) logger.info(f"Successfully pulled Ollama model '{model_name}'.") except Exception as pull_e: @@ -63,10 +63,10 @@ class Processor(EmbeddingsService): use_model = model or self.default_model # Ensure the model exists/is pulled - self._ensure_model(use_model) + await self._ensure_model(use_model) # Ollama handles batch input efficiently - embeds = self.client.embed( + embeds = await self.client.embed( model = use_model, input = texts ) diff --git a/trustgraph-flow/trustgraph/gateway/auth.py b/trustgraph-flow/trustgraph/gateway/auth.py index a693ca32..6abcbe15 100644 --- a/trustgraph-flow/trustgraph/gateway/auth.py +++ b/trustgraph-flow/trustgraph/gateway/auth.py @@ -1,22 +1,371 @@ +""" +IAM-backed authentication and authorisation for the API gateway. -class Authenticator: +The gateway delegates both authentication ("who is this caller?") +and authorisation ("may they do this?") to the IAM regime via the +contract specified in docs/tech-specs/iam-contract.md. No regime- +specific policy (roles, scopes, claims) lives in the gateway. 
- def __init__(self, token=None, allow_all=False): +- Authentication: API keys are resolved by IAM; JWTs are validated + locally against the cached signing public key. +- Authorisation: every per-request decision is asked of IAM via + ``authorise(identity, capability, resource, parameters)``, with + results cached for the TTL the regime returns. +""" - if not allow_all and token is None: - raise RuntimeError("Need a token") +import asyncio +import base64 +import hashlib +import json +import logging +import time +import uuid +from dataclasses import dataclass, field - if not allow_all and token == "": - raise RuntimeError("Need a token") +from aiohttp import web - self.token = token - self.allow_all = allow_all +from cryptography.hazmat.primitives import serialization +from cryptography.hazmat.primitives.asymmetric import ed25519 - def permitted(self, token, roles): +from ..base.iam_client import IamClient +from ..base.metrics import ProducerMetrics, SubscriberMetrics +from ..schema import ( + IamRequest, IamResponse, + iam_request_queue, iam_response_queue, +) - if self.allow_all: return True +logger = logging.getLogger("auth") - if self.token != token: return False +API_KEY_CACHE_TTL = 60 # seconds - return True +# Upper bound on cache TTL the gateway honours for an authorisation +# decision, regardless of what the regime suggested. Caps the +# revocation latency window. +AUTHZ_CACHE_TTL_MAX = 60 # seconds + +@dataclass +class Identity: + """The gateway-side surface of an authenticated caller. + + Per the IAM contract this is a small fixed shape; regime-internal + state (roles, claims, group memberships) is reachable only via + the regime's ``authorise`` operation. The gateway itself never + reads policy from this object. + """ + # Opaque handle, quoted back when calling ``authorise``. For + # the OSS regime this is the user record's id; the gateway + # treats it as a string with no semantic content. 
+ handle: str + # The workspace this credential authenticates to. Used by the + # gateway as the default-fill-in for operations that omit a + # workspace. Never used as policy input. + workspace: str + # Stable identifier for audit logs. In OSS this is the same + # value as ``handle``; not assumed equal in the contract. + principal_id: str + # How the credential was presented. Non-policy; useful for + # logs / metrics only. + source: str # "api-key" | "jwt" + + +def _auth_failure(): + return web.HTTPUnauthorized( + text='{"error":"auth failure"}', + content_type="application/json", + ) + + +def _access_denied(): + return web.HTTPForbidden( + text='{"error":"access denied"}', + content_type="application/json", + ) + + +def _b64url_decode(s): + pad = "=" * (-len(s) % 4) + return base64.urlsafe_b64decode(s + pad) + + +def _verify_jwt_eddsa(token, public_pem): + """Verify an Ed25519 JWT and return its claims. Raises on any + validation failure. Refuses non-EdDSA algorithms.""" + parts = token.split(".") + if len(parts) != 3: + raise ValueError("malformed JWT") + h_b64, p_b64, s_b64 = parts + signing_input = f"{h_b64}.{p_b64}".encode("ascii") + header = json.loads(_b64url_decode(h_b64)) + if header.get("alg") != "EdDSA": + raise ValueError(f"unsupported alg: {header.get('alg')!r}") + + key = serialization.load_pem_public_key(public_pem.encode("ascii")) + if not isinstance(key, ed25519.Ed25519PublicKey): + raise ValueError("public key is not Ed25519") + + signature = _b64url_decode(s_b64) + key.verify(signature, signing_input) # raises InvalidSignature + + claims = json.loads(_b64url_decode(p_b64)) + exp = claims.get("exp") + if exp is None or exp < time.time(): + raise ValueError("expired") + return claims + + +class IamAuth: + """Resolves bearer credentials via the IAM service. + + Used by every gateway endpoint that needs authentication. Fetches + the IAM signing public key at startup (cached in memory). 
class IamAuth:
    """Resolves bearer credentials via the IAM service.

    Used by every gateway endpoint that needs authentication. Fetches
    the IAM signing public key at startup (cached in memory). API
    keys are resolved via the IAM service with a local hash→identity
    cache (short TTL so revoked keys stop working within the TTL
    window without any push mechanism)."""

    def __init__(self, backend, id="api-gateway"):
        self.backend = backend
        self.id = id

        # Populated at start() via IAM.
        self._signing_public_pem = None

        # API-key cache: plaintext_sha256_hex -> (Identity, expires_ts)
        # NOTE(review): unbounded — entries only ever overwrite by
        # hash, so size tracks distinct keys seen; acceptable for
        # expected key counts, revisit if keys are caller-generated.
        self._key_cache = {}
        self._key_cache_lock = asyncio.Lock()

        # Authorisation decision cache: hash(handle, capability,
        # resource, parameters) -> (allow_bool, expires_ts). Holds
        # both allows and denies — denies cached briefly to avoid
        # hammering iam-svc with repeated rejected attempts.
        self._authz_cache: dict[str, tuple[bool, float]] = {}
        self._authz_cache_lock = asyncio.Lock()

    # ------------------------------------------------------------------
    # Short-lived client helper. Mirrors the pattern used by the
    # bootstrap framework and AsyncProcessor: a fresh uuid suffix per
    # invocation so Pulsar exclusive subscriptions don't collide with
    # ghosts from prior calls.
    # ------------------------------------------------------------------

    def _make_client(self):
        rr_id = str(uuid.uuid4())
        return IamClient(
            backend=self.backend,
            subscription=f"{self.id}--iam--{rr_id}",
            consumer_name=self.id,
            request_topic=iam_request_queue,
            request_schema=IamRequest,
            request_metrics=ProducerMetrics(
                processor=self.id, flow=None, name="iam-request",
            ),
            response_topic=iam_response_queue,
            response_schema=IamResponse,
            response_metrics=SubscriberMetrics(
                processor=self.id, flow=None, name="iam-response",
            ),
        )

    async def _with_client(self, op):
        """Open a short-lived IamClient, run ``op(client)``, close."""
        client = self._make_client()
        await client.start()
        try:
            return await op(client)
        finally:
            try:
                await client.stop()
            except Exception:
                # Best-effort teardown; a failed stop must not mask
                # op's result or exception.
                pass

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    async def start(self, max_retries=30, retry_delay=2.0):
        """Fetch the signing public key from IAM. Retries on
        failure — the gateway may be starting before IAM is ready."""

        async def _fetch(client):
            return await client.get_signing_key_public()

        for attempt in range(max_retries):
            try:
                pem = await self._with_client(_fetch)
                if pem:
                    self._signing_public_pem = pem
                    logger.info(
                        "IamAuth: fetched IAM signing public key "
                        f"({len(pem)} bytes)"
                    )
                    return
            except Exception as e:
                logger.info(
                    f"IamAuth: waiting for IAM signing key "
                    f"({type(e).__name__}: {e}); "
                    f"retry {attempt + 1}/{max_retries}"
                )
            # Delay applies to the empty-pem case as well as errors.
            await asyncio.sleep(retry_delay)

        # Don't prevent startup forever. A later authenticate() call
        # will try again via the JWT path.
        logger.warning(
            "IamAuth: could not fetch IAM signing key at startup; "
            "JWT validation will fail until it's available"
        )

    # ------------------------------------------------------------------
    # Authentication
    # ------------------------------------------------------------------

    async def authenticate(self, request):
        """Extract and validate the Bearer credential from an HTTP
        request. Returns an ``Identity``. Raises HTTPUnauthorized
        (401 / "auth failure") on any failure mode — the caller
        cannot distinguish missing / malformed / invalid / expired /
        revoked credentials."""

        header = request.headers.get("Authorization", "")
        if not header.startswith("Bearer "):
            raise _auth_failure()
        token = header[len("Bearer "):].strip()
        if not token:
            raise _auth_failure()

        # API keys always start with "tg_". JWTs have two dots and
        # no "tg_" prefix. Discriminate cheaply.
        if token.startswith("tg_"):
            return await self._resolve_api_key(token)
        if token.count(".") == 2:
            return self._verify_jwt(token)
        raise _auth_failure()

    def _verify_jwt(self, token):
        # No key yet (IAM was down at startup) → fail closed.
        if not self._signing_public_pem:
            raise _auth_failure()
        try:
            claims = _verify_jwt_eddsa(token, self._signing_public_pem)
        except Exception as e:
            logger.debug(f"JWT validation failed: {type(e).__name__}: {e}")
            raise _auth_failure()

        sub = claims.get("sub", "")
        ws = claims.get("workspace", "")
        if not sub or not ws:
            raise _auth_failure()

        # JWT carries no policy state under the IAM contract;
        # any roles / claims field is ignored here.
        return Identity(
            handle=sub, workspace=ws, principal_id=sub, source="jwt",
        )

    async def _resolve_api_key(self, plaintext):
        # Cache keyed on SHA-256 of the plaintext key.
        h = hashlib.sha256(plaintext.encode("utf-8")).hexdigest()

        cached = self._key_cache.get(h)
        now = time.time()
        if cached and cached[1] > now:
            return cached[0]

        async with self._key_cache_lock:
            # Re-read the clock after the (possibly long) lock wait:
            # the pre-lock "now" is stale and could let a just-expired
            # entry be served from cache.
            now = time.time()
            cached = self._key_cache.get(h)
            if cached and cached[1] > now:
                return cached[0]

            try:
                async def _call(client):
                    return await client.resolve_api_key(plaintext)
                # ``roles`` is returned by the OSS regime as a hint
                # but is not consulted by the gateway; all policy
                # decisions go through ``authorise``.
                user_id, workspace, _roles = await self._with_client(_call)
            except Exception as e:
                logger.debug(
                    f"API key resolution failed: "
                    f"{type(e).__name__}: {e}"
                )
                raise _auth_failure()

            if not user_id or not workspace:
                raise _auth_failure()

            identity = Identity(
                handle=user_id, workspace=workspace,
                principal_id=user_id, source="api-key",
            )
            self._key_cache[h] = (identity, now + API_KEY_CACHE_TTL)
            return identity

    # ------------------------------------------------------------------
    # Authorisation
    # ------------------------------------------------------------------

    @staticmethod
    def _authz_cache_key(handle, capability, resource, parameters):
        # Canonical JSON (sorted keys, tight separators) so logically
        # equal inputs always hash identically; None maps to {}.
        payload = json.dumps(
            {
                "h": handle,
                "c": capability,
                "r": resource or {},
                "p": parameters or {},
            },
            sort_keys=True,
            separators=(",", ":"),
        )
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

    async def authorise(self, identity, capability, resource, parameters):
        """Ask the IAM regime whether ``identity`` may perform
        ``capability`` on ``resource`` given ``parameters``.

        Caches the decision for the regime's suggested TTL, clamped
        above by ``AUTHZ_CACHE_TTL_MAX``. Both allow and deny
        decisions are cached (denies briefly, to avoid hammering
        iam-svc with repeated rejected attempts).

        Raises ``HTTPForbidden`` (403 / "access denied") on a deny
        decision. Raises ``HTTPUnauthorized`` (401 / "auth failure")
        if the IAM service errors out — failing closed."""

        key = self._authz_cache_key(
            identity.handle, capability, resource, parameters,
        )
        now = time.time()

        cached = self._authz_cache.get(key)
        if cached and cached[1] > now:
            allow, _ = cached
            if not allow:
                raise _access_denied()
            return

        async with self._authz_cache_lock:
            # Refresh the clock after the lock wait (see
            # _resolve_api_key).
            now = time.time()
            cached = self._authz_cache.get(key)
            if cached and cached[1] > now:
                allow, _ = cached
                if not allow:
                    raise _access_denied()
                return

            try:
                async def _call(client):
                    return await client.authorise(
                        identity.handle, capability,
                        resource or {}, parameters or {},
                    )
                allow, ttl = await self._with_client(_call)
            except Exception as e:
                logger.warning(
                    f"authorise failed: {type(e).__name__}: {e}; "
                    f"failing closed for "
                    f"{identity.principal_id!r} cap={capability!r}"
                )
                raise _auth_failure()

            # NOTE(review): a regime ttl of 0/None means the decision
            # is effectively uncached (expires immediately).
            ttl = max(0, min(int(ttl or 0), AUTHZ_CACHE_TTL_MAX))
            self._authz_cache[key] = (bool(allow), now + ttl)

            if not allow:
                raise _access_denied()
            return
PUBLIC = "__public__"
AUTHENTICATED = "__authenticated__"


def access_denied():
    """403 response body for a regime deny decision."""
    return web.HTTPForbidden(
        text='{"error":"access denied"}',
        content_type="application/json",
    )


def auth_failure():
    """Masked 401 response body for every authentication failure."""
    return web.HTTPUnauthorized(
        text='{"error":"auth failure"}',
        content_type="application/json",
    )


async def enforce(request, auth, capability):
    """Authenticate the caller and, unless ``capability`` is a
    sentinel, ask the IAM regime for a system-level decision.

    Returns ``None`` for ``PUBLIC`` (no authentication at all),
    otherwise the authenticated ``Identity``. Authorisation and
    authentication failures propagate from ``auth`` as
    HTTPForbidden / HTTPUnauthorized.
    """
    # Public endpoints bypass authentication entirely.
    if capability == PUBLIC:
        return None

    identity = await auth.authenticate(request)

    # Anything beyond the AUTHENTICATED sentinel needs a regime
    # decision; resource is system-level ({}) with no parameters.
    if capability != AUTHENTICATED:
        await auth.authorise(identity, capability, {}, {})

    return identity


async def enforce_workspace(data, identity, auth, capability=None):
    """Default-fill ``data["workspace"]`` from the caller's bound
    workspace and, when ``capability`` is given, authorise the
    caller for it against that workspace.

    Non-dict payloads are passed through untouched. On success the
    resolved workspace is written back into ``data`` so downstream
    code sees a single canonical address. With ``capability=None``
    no authorisation call is made — the caller has presumably
    already authorised via :func:`enforce`.
    """
    if not isinstance(data, dict):
        return data

    target = data.get("workspace", "") or identity.workspace
    data["workspace"] = target

    if capability is not None:
        await auth.authorise(
            identity, capability, {"workspace": target}, {},
        )

    return data
class IamRequestor(ServiceRequestor):
    """Gateway dispatcher for the global IAM service.

    Thin ServiceRequestor specialisation: pins the IAM request and
    response queues and schemas, and delegates payload translation
    to the translators registered under the "iam" kind.
    """

    def __init__(self, backend, consumer, subscriber, timeout=120,
            request_queue=None, response_queue=None):

        # Queue overrides support multi-gateway / test deployments;
        # default to the shared IAM queues.
        if request_queue is None:
            request_queue = iam_request_queue
        if response_queue is None:
            response_queue = iam_response_queue

        super().__init__(
            backend=backend,
            consumer_name=consumer,
            subscription=subscriber,
            request_queue=request_queue,
            response_queue=response_queue,
            request_schema=IamRequest,
            response_schema=IamResponse,
            timeout=timeout,
        )

        self.request_translator = (
            TranslatorRegistry.get_request_translator("iam")
        )
        self.response_translator = (
            TranslatorRegistry.get_response_translator("iam")
        )

    def to_request(self, body):
        # JSON body -> IamRequest message.
        return self.request_translator.decode(body)

    def from_response(self, message):
        # IamResponse message -> JSON payload + completion flag.
        return self.response_translator.encode_with_completion(message)
collection_management import CollectionManagementRequestor @@ -72,6 +73,7 @@ request_response_dispatchers = { global_dispatchers = { "config": ConfigRequestor, "flow": FlowRequestor, + "iam": IamRequestor, "librarian": LibrarianRequestor, "knowledge": KnowledgeRequestor, "collection-management": CollectionManagementRequestor, @@ -105,13 +107,31 @@ class DispatcherWrapper: class DispatcherManager: - def __init__(self, backend, config_receiver, prefix="api-gateway", - queue_overrides=None): + def __init__(self, backend, config_receiver, auth, + prefix="api-gateway", queue_overrides=None): + """ + ``auth`` is required. It flows into the Mux for first-frame + WebSocket authentication and into downstream dispatcher + construction. There is no permissive default — constructing + a DispatcherManager without an authenticator would be a + silent downgrade to no-auth on the socket path. + """ + if auth is None: + raise ValueError( + "DispatcherManager requires an 'auth' argument — there " + "is no no-auth mode" + ) + self.backend = backend self.config_receiver = config_receiver self.config_receiver.add_handler(self) self.prefix = prefix + # Gateway IamAuth — used by the socket Mux for first-frame + # auth and by any dispatcher that needs to resolve caller + # identity out-of-band. + self.auth = auth + # Store queue overrides for global services # Format: {"config": {"request": "...", "response": "..."}, ...} self.queue_overrides = queue_overrides or {} @@ -163,6 +183,15 @@ class DispatcherManager: def dispatch_global_service(self): return DispatcherWrapper(self.process_global_service) + def dispatch_auth_iam(self): + """Pre-configured IAM dispatcher for the gateway's auth + endpoints (login, bootstrap, change-password). 
Pins the + kind to ``iam`` so these handlers don't have to supply URL + params the global dispatcher would expect.""" + async def _process(data, responder): + return await self.invoke_global_service(data, responder, "iam") + return DispatcherWrapper(_process) + def dispatch_core_export(self): return DispatcherWrapper(self.process_core_export) @@ -314,7 +343,10 @@ class DispatcherManager: async def process_socket(self, ws, running, params): - dispatcher = Mux(self, ws, running) + # The mux self-authenticates via the first-frame protocol; + # pass the gateway's IamAuth so it can validate tokens + # without reaching back into the endpoint layer. + dispatcher = Mux(self, ws, running, auth=self.auth) return dispatcher diff --git a/trustgraph-flow/trustgraph/gateway/dispatch/mux.py b/trustgraph-flow/trustgraph/gateway/dispatch/mux.py index 3d610dca..03cd748b 100644 --- a/trustgraph-flow/trustgraph/gateway/dispatch/mux.py +++ b/trustgraph-flow/trustgraph/gateway/dispatch/mux.py @@ -16,11 +16,28 @@ MAX_QUEUE_SIZE = 10 class Mux: - def __init__(self, dispatcher_manager, ws, running): + def __init__(self, dispatcher_manager, ws, running, auth): + """ + ``auth`` is required — the Mux implements the first-frame + auth protocol described in ``iam.md`` and will refuse any + non-auth frame until an ``auth-ok`` has been issued. There + is no no-auth mode. + """ + if auth is None: + raise ValueError( + "Mux requires an 'auth' argument — there is no " + "no-auth mode" + ) self.dispatcher_manager = dispatcher_manager self.ws = ws self.running = running + self.auth = auth + + # Authenticated identity, populated by the first-frame auth + # protocol. ``None`` means the socket is not yet + # authenticated; any non-auth frame is refused. + self.identity = None self.q = asyncio.Queue(maxsize=MAX_QUEUE_SIZE) @@ -31,6 +48,41 @@ class Mux: if self.ws: await self.ws.close() + async def _handle_auth_frame(self, data): + """Process a ``{"type": "auth", "token": "..."}`` frame. 
+ On success, updates ``self.identity`` and returns an + ``auth-ok`` response frame. On failure, returns the masked + auth-failure frame. Never raises — auth failures keep the + socket open so the client can retry without reconnecting + (important for browsers, which treat a handshake-time 401 + as terminal).""" + token = data.get("token", "") + if not token: + await self.ws.send_json({ + "type": "auth-failed", + "error": "auth failure", + }) + return + + class _Shim: + def __init__(self, tok): + self.headers = {"Authorization": f"Bearer {tok}"} + + try: + identity = await self.auth.authenticate(_Shim(token)) + except Exception: + await self.ws.send_json({ + "type": "auth-failed", + "error": "auth failure", + }) + return + + self.identity = identity + await self.ws.send_json({ + "type": "auth-ok", + "workspace": identity.workspace, + }) + async def receive(self, msg): request_id = None @@ -38,6 +90,16 @@ class Mux: try: data = msg.json() + + # In-band auth protocol: the client sends + # ``{"type": "auth", "token": "..."}`` as its first frame + # (and any time it wants to re-auth: JWT refresh, token + # rotation, etc). Auth is always required on a Mux — + # there is no no-auth mode. + if isinstance(data, dict) and data.get("type") == "auth": + await self._handle_auth_frame(data) + return + request_id = data.get("id") if "request" not in data: @@ -46,9 +108,125 @@ class Mux: if "id" not in data: raise RuntimeError("Bad message") + # Reject all non-auth frames until an ``auth-ok`` has + # been issued. + if self.identity is None: + await self.ws.send_json({ + "id": request_id, + "error": { + "message": "auth failure", + "type": "auth-required", + }, + "complete": True, + }) + return + + # Per-service capability gating. Resolved through the + # operation registry so the WS path matches what HTTP + # callers see — same authority, same caps. 
+ # + # Lookup mirrors the HTTP routing decision in + # ``request_task``: presence of ``flow`` on the envelope + # means a flow-level data-plane service (graph-rag, + # agent, …); absence means a workspace-level service + # (config, flow management, librarian, …) whose specific + # operation is in the inner request body. ``iam`` is + # treated as workspace-level too — its operations are + # registered with bare names, no kind prefix. + from ..registry import lookup as _registry_lookup + from ..capabilities import enforce_workspace + from aiohttp import web as _web + + service = data.get("service", "") + inner = data.get("request") or {} + inner_op = inner.get("operation", "") if isinstance(inner, dict) else "" + + if data.get("flow"): + op = _registry_lookup(f"flow-service:{service}") + elif service == "iam": + op = _registry_lookup(inner_op) if inner_op else None + else: + op = _registry_lookup(f"{service}:{inner_op}") if inner_op else None + + if op is None: + await self.ws.send_json({ + "id": request_id, + "error": { + "message": "unknown service", + "type": "unknown-service", + }, + "complete": True, + }) + return + + # Resolve workspace first (default-fill from the caller's + # bound workspace), then ask the regime to authorise the + # service-level capability against the matched + # operation's resource shape. + try: + await enforce_workspace(data, self.identity, self.auth) + if isinstance(inner, dict): + await enforce_workspace(inner, self.identity, self.auth) + + if data.get("flow"): + resource = { + "workspace": data.get("workspace", ""), + "flow": data.get("flow", ""), + } + parameters = {} + else: + # Build a minimal RequestContext so the matched + # operation's own extractors decide resource and + # parameters — same path the HTTP endpoints take. 
+ from ..registry import RequestContext + ctx = RequestContext( + body=inner if isinstance(inner, dict) else {}, + match_info={}, + identity=self.identity, + ) + resource = op.extract_resource(ctx) + parameters = op.extract_parameters(ctx) + + await self.auth.authorise( + self.identity, op.capability, resource, parameters, + ) + except _web.HTTPForbidden: + await self.ws.send_json({ + "id": request_id, + "error": { + "message": "access denied", + "type": "access-denied", + }, + "complete": True, + }) + return + except _web.HTTPUnauthorized: + await self.ws.send_json({ + "id": request_id, + "error": { + "message": "auth failure", + "type": "auth-required", + }, + "complete": True, + }) + return + + workspace = data["workspace"] + + # Plumb authenticated caller's handle as ``actor`` so + # iam-svc handlers (whoami, future actor-scoped checks) + # know who is calling. Overwrite any caller-supplied + # value so it can't be spoofed over the WS. + if ( + service == "iam" + and isinstance(data.get("request"), dict) + and self.identity is not None + ): + data["request"]["actor"] = self.identity.handle + await self.q.put(( data["id"], - data.get("workspace", "default"), + workspace, data.get("flow"), data["service"], data["request"] diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/auth_endpoints.py b/trustgraph-flow/trustgraph/gateway/endpoint/auth_endpoints.py new file mode 100644 index 00000000..44bbc03e --- /dev/null +++ b/trustgraph-flow/trustgraph/gateway/endpoint/auth_endpoints.py @@ -0,0 +1,131 @@ +""" +Gateway auth endpoints. + +Three dedicated paths: + POST /api/v1/auth/login — unauthenticated; username/password → JWT + POST /api/v1/auth/bootstrap — unauthenticated; IAM bootstrap op + POST /api/v1/auth/change-password — authenticated; any role + +These are the only IAM-surface operations that can be reached from +outside. Everything else routes through ``/api/v1/iam`` gated by +``users:admin``. +""" + +import logging + +from aiohttp import web + +from .. 
capabilities import enforce, PUBLIC, AUTHENTICATED + +logger = logging.getLogger("auth-endpoints") +logger.setLevel(logging.INFO) + + +class AuthEndpoints: + """Groups the three auth-surface handlers. Each forwards to the + IAM service via the existing ``IamRequestor`` dispatcher.""" + + def __init__(self, iam_dispatcher, auth): + self.iam = iam_dispatcher + self.auth = auth + + async def start(self): + pass + + def add_routes(self, app): + app.add_routes([ + web.post("/api/v1/auth/login", self.login), + web.post("/api/v1/auth/bootstrap", self.bootstrap), + web.post( + "/api/v1/auth/bootstrap-status", + self.bootstrap_status, + ), + web.post( + "/api/v1/auth/change-password", + self.change_password, + ), + ]) + + async def _forward(self, body): + async def responder(x, fin): + pass + return await self.iam.process(body, responder) + + async def login(self, request): + """Public. Accepts {username, password, workspace?}. Returns + {jwt, jwt_expires} on success; IAM's masked auth failure on + anything else.""" + await enforce(request, self.auth, PUBLIC) + try: + body = await request.json() + except Exception: + return web.json_response( + {"error": "invalid json"}, status=400, + ) + req = { + "operation": "login", + "username": body.get("username", ""), + "password": body.get("password", ""), + "workspace": body.get("workspace", ""), + } + resp = await self._forward(req) + if "error" in resp: + return web.json_response( + {"error": "auth failure"}, status=401, + ) + return web.json_response(resp) + + async def bootstrap(self, request): + """Public. Valid only when IAM is running in bootstrap mode + with empty tables. 
In every other case the IAM service + returns a masked auth-failure.""" + await enforce(request, self.auth, PUBLIC) + resp = await self._forward({"operation": "bootstrap"}) + if "error" in resp: + return web.json_response( + {"error": "auth failure"}, status=401, + ) + return web.json_response(resp) + + async def bootstrap_status(self, request): + """Public, side-effect-free. Returns ``{"bootstrap_available": + bool}`` so a UI can decide whether to render first-run setup + without invoking the consuming ``bootstrap`` op.""" + await enforce(request, self.auth, PUBLIC) + resp = await self._forward({"operation": "bootstrap-status"}) + if "error" in resp: + return web.json_response( + {"error": "auth failure"}, status=401, + ) + return web.json_response(resp) + + async def change_password(self, request): + """Authenticated (any role). Accepts {current_password, + new_password}; user_id is taken from the authenticated + identity — the caller cannot change someone else's password + this way (reset-password is the admin path).""" + identity = await enforce(request, self.auth, AUTHENTICATED) + try: + body = await request.json() + except Exception: + return web.json_response( + {"error": "invalid json"}, status=400, + ) + req = { + "operation": "change-password", + "user_id": identity.handle, + "password": body.get("current_password", ""), + "new_password": body.get("new_password", ""), + } + resp = await self._forward(req) + if "error" in resp: + err_type = resp.get("error", {}).get("type", "") + if err_type == "auth-failed": + return web.json_response( + {"error": "auth failure"}, status=401, + ) + return web.json_response( + {"error": resp.get("error", {}).get("message", "error")}, + status=400, + ) + return web.json_response(resp) diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/constant_endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint/constant_endpoint.py index 58ba1738..920b02ca 100644 --- 
a/trustgraph-flow/trustgraph/gateway/endpoint/constant_endpoint.py +++ b/trustgraph-flow/trustgraph/gateway/endpoint/constant_endpoint.py @@ -1,28 +1,27 @@ -import asyncio -from aiohttp import web -import uuid import logging +from aiohttp import web + +from .. capabilities import enforce, enforce_workspace + logger = logging.getLogger("endpoint") logger.setLevel(logging.INFO) + class ConstantEndpoint: - def __init__(self, endpoint_path, auth, dispatcher): + def __init__(self, endpoint_path, auth, dispatcher, capability): self.path = endpoint_path - self.auth = auth - self.operation = "service" - + self.capability = capability self.dispatcher = dispatcher async def start(self): pass def add_routes(self, app): - app.add_routes([ web.post(self.path, self.handle), ]) @@ -31,22 +30,14 @@ class ConstantEndpoint: logger.debug(f"Processing request: {request.path}") - try: - ht = request.headers["Authorization"] - tokens = ht.split(" ", 2) - if tokens[0] != "Bearer": - return web.HTTPUnauthorized() - token = tokens[1] - except: - token = "" - - if not self.auth.permitted(token, self.operation): - return web.HTTPUnauthorized() + identity = await enforce(request, self.auth, self.capability) try: - data = await request.json() + if identity is not None: + await enforce_workspace(data, identity, self.auth) + async def responder(x, fin): pass @@ -54,10 +45,8 @@ class ConstantEndpoint: return web.json_response(resp) + except web.HTTPException: + raise except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - + logger.error(f"Exception: {e}", exc_info=True) + return web.json_response({"error": str(e)}) diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/i18n.py b/trustgraph-flow/trustgraph/gateway/endpoint/i18n.py index b949a499..f28f293d 100644 --- a/trustgraph-flow/trustgraph/gateway/endpoint/i18n.py +++ b/trustgraph-flow/trustgraph/gateway/endpoint/i18n.py @@ -4,16 +4,18 @@ from aiohttp import web from 
trustgraph.i18n import get_language_pack +from .. capabilities import enforce + logger = logging.getLogger("endpoint") logger.setLevel(logging.INFO) class I18nPackEndpoint: - def __init__(self, endpoint_path: str, auth): + def __init__(self, endpoint_path: str, auth, capability): self.path = endpoint_path self.auth = auth - self.operation = "service" + self.capability = capability async def start(self): pass @@ -26,26 +28,13 @@ class I18nPackEndpoint: async def handle(self, request): logger.debug(f"Processing i18n pack request: {request.path}") - token = "" - try: - ht = request.headers["Authorization"] - tokens = ht.split(" ", 2) - if tokens[0] != "Bearer": - return web.HTTPUnauthorized() - token = tokens[1] - except Exception: - token = "" - - if not self.auth.permitted(token, self.operation): - return web.HTTPUnauthorized() + await enforce(request, self.auth, self.capability) lang = request.match_info.get("lang") or "en" - # This is a path traversal defense, and is a critical sec defense. - # Do not remove! + # Path-traversal defense — critical, do not remove. if "/" in lang or ".." in lang: return web.HTTPBadRequest(reason="Invalid language code") pack = get_language_pack(lang) - return web.json_response(pack) diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/iam_endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint/iam_endpoint.py new file mode 100644 index 00000000..749eacd3 --- /dev/null +++ b/trustgraph-flow/trustgraph/gateway/endpoint/iam_endpoint.py @@ -0,0 +1,114 @@ +""" +Registry-driven /api/v1/iam endpoint. + +The gateway no longer gates IAM management with a single coarse +``users:admin`` capability. Instead, each operation declares its +own capability + resource shape in the registry (``registry.py``); +this endpoint reads the body's ``operation`` field, looks up the +declaration, and asks the IAM regime to authorise the call. + +Operations not in the registry produce a 400 ``unknown operation``. 
+This is the gateway's primary mechanism for fail-closed gating of +the IAM surface — the registry is the source of truth. +""" + +import logging + +from aiohttp import web + +from .. capabilities import ( + PUBLIC, AUTHENTICATED, auth_failure, +) +from .. registry import lookup, RequestContext + +logger = logging.getLogger("iam-endpoint") +logger.setLevel(logging.INFO) + + +class IamEndpoint: + """POST /api/v1/iam — generic forwarder gated by the operation + registry. The IAM dispatcher (``iam_dispatcher``) forwards the + body verbatim to iam-svc once authorisation succeeds.""" + + def __init__(self, endpoint_path, auth, dispatcher): + self.path = endpoint_path + self.auth = auth + self.dispatcher = dispatcher + + async def start(self): + pass + + def add_routes(self, app): + app.add_routes([web.post(self.path, self.handle)]) + + async def handle(self, request): + try: + body = await request.json() + except Exception: + return web.json_response( + {"error": "invalid json"}, status=400, + ) + if not isinstance(body, dict): + return web.json_response( + {"error": "body must be an object"}, status=400, + ) + + op_name = body.get("operation", "") + op = lookup(op_name) + if op is None: + return web.json_response( + {"error": "unknown operation"}, status=400, + ) + + # Authentication: required for everything except PUBLIC. + identity = None + if op.capability != PUBLIC: + try: + identity = await self.auth.authenticate(request) + except web.HTTPException: + raise + + # Authorisation: capability sentinels short-circuit the + # regime call; capability strings go through authorise(). 
+ if op.capability not in (PUBLIC, AUTHENTICATED): + ctx = RequestContext( + body=body, + match_info=dict(request.match_info), + identity=identity, + ) + try: + resource = op.extract_resource(ctx) + parameters = op.extract_parameters(ctx) + except Exception as e: + logger.warning( + f"extractor failed for {op_name!r}: " + f"{type(e).__name__}: {e}" + ) + return web.json_response( + {"error": "bad request"}, status=400, + ) + + await self.auth.authorise( + identity, op.capability, resource, parameters, + ) + + # Plumb the authenticated caller's handle through as ``actor`` + # so iam-svc handlers (e.g. whoami, future actor-scoped + # checks) know who is making the request. The gateway is + # the only authority for this — body-supplied ``actor`` + # values are overwritten so callers can't impersonate. + if identity is not None: + body["actor"] = identity.handle + + async def responder(x, fin): + pass + + try: + resp = await self.dispatcher.process(body, responder) + except web.HTTPException: + raise + except Exception as e: + logger.error(f"Exception: {e}", exc_info=True) + return web.json_response({"error": str(e)}) + + return web.json_response(resp) diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/manager.py b/trustgraph-flow/trustgraph/gateway/endpoint/manager.py index fb8b0b76..ed5ef4b5 100644 --- a/trustgraph-flow/trustgraph/gateway/endpoint/manager.py +++ b/trustgraph-flow/trustgraph/gateway/endpoint/manager.py @@ -8,72 +8,269 @@ from . variable_endpoint import VariableEndpoint from . socket import SocketEndpoint from . metrics import MetricsEndpoint from . i18n import I18nPackEndpoint +from . auth_endpoints import AuthEndpoints +from . iam_endpoint import IamEndpoint +from . registry_endpoint import RegistryRoutedVariableEndpoint + +from .. capabilities import PUBLIC, AUTHENTICATED, auth_failure +from .. registry import lookup as _registry_lookup, RequestContext from .. 
dispatch.manager import DispatcherManager + +# /api/v1/{kind} (config / flow / librarian / knowledge / +# collection-management), /api/v1/iam, and /api/v1/flow/{flow}/... +# routes are all gated per-operation by the registry, not by a +# per-kind capability map. Login / bootstrap / change-password are +# served by AuthEndpoints with their own PUBLIC / AUTHENTICATED +# sentinels. + + +import logging as _mgr_logging +_mgr_logger = _mgr_logging.getLogger("endpoint") + + +class _RoutedVariableEndpoint: + """HTTP endpoint that gates per request via the operation + registry. The URL's ``kind`` parameter combined with a fixed + ``registry_prefix`` yields the registry key — e.g. prefix + ``flow-service`` and kind ``agent`` looks up + ``flow-service:agent``. + + Used for ``/api/v1/flow/{flow}/service/{kind}`` (per-flow + data-plane services). ``/api/v1/{kind}`` (workspace-level + global services) goes through ``RegistryRoutedVariableEndpoint`` + which discriminates on body operation as well as URL kind.""" + + def __init__(self, endpoint_path, auth, dispatcher, registry_prefix): + self.path = endpoint_path + self.auth = auth + self.dispatcher = dispatcher + self._registry_prefix = registry_prefix + + async def start(self): + pass + + def add_routes(self, app): + app.add_routes([web.post(self.path, self.handle)]) + + async def handle(self, request): + kind = request.match_info.get("kind", "") + op = _registry_lookup(f"{self._registry_prefix}:{kind}") + if op is None: + return web.json_response( + {"error": "unknown kind"}, status=404, + ) + + identity = await self.auth.authenticate(request) + + try: + data = await request.json() + ctx = RequestContext( + body=data if isinstance(data, dict) else {}, + match_info=dict(request.match_info), + identity=identity, + ) + resource = op.extract_resource(ctx) + parameters = op.extract_parameters(ctx) + await self.auth.authorise( + identity, op.capability, resource, parameters, + ) + + async def responder(x, fin): + pass + + resp = 
await self.dispatcher.process( + data, responder, request.match_info, + ) + return web.json_response(resp) + + except web.HTTPException: + raise + except Exception as e: + _mgr_logger.error(f"Exception: {e}", exc_info=True) + return web.json_response({"error": str(e)}) + + +class _RoutedSocketEndpoint: + """WebSocket endpoint gated per request via the operation + registry. Like ``_RoutedVariableEndpoint`` but for the + streaming flow import / export socket paths.""" + + def __init__(self, endpoint_path, auth, dispatcher, registry_prefix): + self.path = endpoint_path + self.auth = auth + self.dispatcher = dispatcher + self._registry_prefix = registry_prefix + + async def start(self): + pass + + def add_routes(self, app): + app.add_routes([web.get(self.path, self.handle)]) + + async def handle(self, request): + kind = request.match_info.get("kind", "") + op = _registry_lookup(f"{self._registry_prefix}:{kind}") + if op is None: + return web.json_response( + {"error": "unknown kind"}, status=404, + ) + + token = request.query.get("token", "") + if not token: + return auth_failure() + + from . socket import _QueryTokenRequest + try: + identity = await self.auth.authenticate( + _QueryTokenRequest(token) + ) + except web.HTTPException as e: + return e + + ctx = RequestContext( + body={}, + match_info=dict(request.match_info), + identity=identity, + ) + try: + resource = op.extract_resource(ctx) + parameters = op.extract_parameters(ctx) + await self.auth.authorise( + identity, op.capability, resource, parameters, + ) + except web.HTTPException as e: + return e + + # Delegate the websocket handling to a standalone SocketEndpoint + # with the resolved capability, bypassing the per-request mutation + # concern by instantiating fresh state. 
+ ws_ep = SocketEndpoint( + endpoint_path=self.path, + auth=self.auth, + dispatcher=self.dispatcher, + capability=op.capability, + ) + return await ws_ep.handle(request) + + class EndpointManager: def __init__( - self, dispatcher_manager, auth, prometheus_url, timeout=600 + self, dispatcher_manager, auth, prometheus_url, timeout=600, ): self.dispatcher_manager = dispatcher_manager self.timeout = timeout - self.services = { - } - self.endpoints = [ + + # Auth surface — public / authenticated-any. Must come + # before the generic /api/v1/{kind} routes to win the + # match for /api/v1/auth/* paths. aiohttp routes in + # registration order, so we prepend here. + AuthEndpoints( + iam_dispatcher=dispatcher_manager.dispatch_auth_iam(), + auth=auth, + ), + + # /api/v1/iam — registry-driven IAM management. Per + # operation gating happens inside IamEndpoint via the + # operation registry; the dispatcher forwards verbatim + # to iam-svc once authorisation has succeeded. Listed + # before the generic /api/v1/{kind} route so it wins + # the match for "iam". + IamEndpoint( + endpoint_path="/api/v1/iam", + auth=auth, + dispatcher=dispatcher_manager.dispatch_auth_iam(), + ), + I18nPackEndpoint( - endpoint_path = "/api/v1/i18n/packs/{lang}", - auth = auth, + endpoint_path="/api/v1/i18n/packs/{lang}", + auth=auth, + capability=PUBLIC, ), MetricsEndpoint( - endpoint_path = "/api/metrics", - prometheus_url = prometheus_url, - auth = auth, + endpoint_path="/api/metrics", + prometheus_url=prometheus_url, + auth=auth, + capability="metrics:read", ), - VariableEndpoint( - endpoint_path = "/api/v1/{kind}", auth = auth, - dispatcher = dispatcher_manager.dispatch_global_service(), + + # Global services: registry-driven per-operation gating. + # Each kind+op combination has a registry entry that + # declares its capability and resource shape. 
Listed + # after the IAM and auth-surface routes; aiohttp's + # path matcher prefers the more-specific path so this + # variable route doesn't shadow them. + RegistryRoutedVariableEndpoint( + endpoint_path="/api/v1/{kind}", + auth=auth, + dispatcher=dispatcher_manager.dispatch_global_service(), ), + + # /api/v1/socket: WebSocket handshake accepts + # unconditionally; the Mux dispatcher runs the + # first-frame auth protocol. Handshake-time 401s break + # browser reconnection, so authentication is always + # in-band for this endpoint. SocketEndpoint( - endpoint_path = "/api/v1/socket", - auth = auth, - dispatcher = dispatcher_manager.dispatch_socket() + endpoint_path="/api/v1/socket", + auth=auth, + dispatcher=dispatcher_manager.dispatch_socket(), + capability=AUTHENTICATED, # informational only; bypassed + in_band_auth=True, ), - VariableEndpoint( - endpoint_path = "/api/v1/flow/{flow}/service/{kind}", - auth = auth, - dispatcher = dispatcher_manager.dispatch_flow_service(), + + # Per-flow request/response services — gated per + # ``flow-service:`` registry entry. + _RoutedVariableEndpoint( + endpoint_path="/api/v1/flow/{flow}/service/{kind}", + auth=auth, + dispatcher=dispatcher_manager.dispatch_flow_service(), + registry_prefix="flow-service", ), - SocketEndpoint( - endpoint_path = "/api/v1/flow/{flow}/import/{kind}", - auth = auth, - dispatcher = dispatcher_manager.dispatch_flow_import() + + # Per-flow streaming import/export — gated per + # ``flow-import:`` / ``flow-export:`` registry + # entry. 
+ _RoutedSocketEndpoint( + endpoint_path="/api/v1/flow/{flow}/import/{kind}", + auth=auth, + dispatcher=dispatcher_manager.dispatch_flow_import(), + registry_prefix="flow-import", ), - SocketEndpoint( - endpoint_path = "/api/v1/flow/{flow}/export/{kind}", - auth = auth, - dispatcher = dispatcher_manager.dispatch_flow_export() + _RoutedSocketEndpoint( + endpoint_path="/api/v1/flow/{flow}/export/{kind}", + auth=auth, + dispatcher=dispatcher_manager.dispatch_flow_export(), + registry_prefix="flow-export", + ), + + StreamEndpoint( + endpoint_path="/api/v1/import-core", + auth=auth, + method="POST", + dispatcher=dispatcher_manager.dispatch_core_import(), + # Cross-subject import — require the admin bundle via a + # single representative capability. + capability="users:admin", ), StreamEndpoint( - endpoint_path = "/api/v1/import-core", - auth = auth, - method = "POST", - dispatcher = dispatcher_manager.dispatch_core_import(), + endpoint_path="/api/v1/export-core", + auth=auth, + method="GET", + dispatcher=dispatcher_manager.dispatch_core_export(), + capability="users:admin", ), StreamEndpoint( - endpoint_path = "/api/v1/export-core", - auth = auth, - method = "GET", - dispatcher = dispatcher_manager.dispatch_core_export(), - ), - StreamEndpoint( - endpoint_path = "/api/v1/document-stream", - auth = auth, - method = "GET", - dispatcher = dispatcher_manager.dispatch_document_stream(), + endpoint_path="/api/v1/document-stream", + auth=auth, + method="GET", + dispatcher=dispatcher_manager.dispatch_document_stream(), + capability="documents:read", ), ] @@ -84,4 +281,3 @@ class EndpointManager: async def start(self): for ep in self.endpoints: await ep.start() - diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/metrics.py b/trustgraph-flow/trustgraph/gateway/endpoint/metrics.py index d17d111b..6832d1e3 100644 --- a/trustgraph-flow/trustgraph/gateway/endpoint/metrics.py +++ b/trustgraph-flow/trustgraph/gateway/endpoint/metrics.py @@ -10,17 +10,19 @@ import asyncio import 
uuid import logging +from .. capabilities import enforce + logger = logging.getLogger("endpoint") logger.setLevel(logging.INFO) class MetricsEndpoint: - def __init__(self, prometheus_url, endpoint_path, auth): + def __init__(self, prometheus_url, endpoint_path, auth, capability): self.prometheus_url = prometheus_url self.path = endpoint_path self.auth = auth - self.operation = "service" + self.capability = capability async def start(self): pass @@ -35,38 +37,39 @@ class MetricsEndpoint: logger.debug(f"Processing metrics request: {request.path}") - try: - ht = request.headers["Authorization"] - tokens = ht.split(" ", 2) - if tokens[0] != "Bearer": - return web.HTTPUnauthorized() - token = tokens[1] - except: - token = "" + await enforce(request, self.auth, self.capability) - if not self.auth.permitted(token, self.operation): - return web.HTTPUnauthorized() + path = request.match_info["path"] + url = ( + self.prometheus_url + "/api/v1/" + path + "?" + + request.query_string + ) try: - path = request.match_info["path"] - async with aiohttp.ClientSession() as session: - - url = ( - self.prometheus_url + "/api/v1/" + path + "?" + - request.query_string - ) - async with session.get(url) as resp: return web.Response( status=resp.status, text=await resp.text() ) + except aiohttp.ClientConnectionError as e: + + # Upstream unreachable (connect refused, DNS failure, + # server disconnect). Distinguish from our own errors so + # callers know where the fault is. 
+ logger.error(f"Metrics upstream {url} unreachable: {e}") + return web.Response( + status=502, + text=f"Bad Gateway: metrics upstream unreachable: {e}", + ) + except Exception as e: - logging.error(f"Exception: {e}") - - raise web.HTTPInternalServerError() + logger.error(f"Metrics proxy exception: {e}", exc_info=True) + return web.Response( + status=500, + text=f"Internal Server Error: {e}", + ) diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/registry_endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint/registry_endpoint.py new file mode 100644 index 00000000..296376fa --- /dev/null +++ b/trustgraph-flow/trustgraph/gateway/endpoint/registry_endpoint.py @@ -0,0 +1,123 @@ +""" +Registry-driven dispatch for ``/api/v1/{kind}`` global services. + +The body's ``operation`` field plus the URL's ``{kind}`` together +form the canonical operation name (``:``) that the +gateway looks up in ``registry.py``. The matched operation +declares its capability and resource shape; this endpoint asks the +IAM regime to authorise the call before forwarding the body +verbatim to the backend dispatcher. + +The dispatcher is the same ``dispatch_global_service()`` factory the +old coarse path used; only the gating layer has changed. + +Operations not present in the registry are rejected with 400 +``unknown operation`` — fail closed. +""" + +import logging + +from aiohttp import web + +from .. capabilities import ( + PUBLIC, AUTHENTICATED, auth_failure, +) +from .. 
registry import lookup, RequestContext + +logger = logging.getLogger("registry-endpoint") +logger.setLevel(logging.INFO) + + +class RegistryRoutedVariableEndpoint: + """POST /api/v1/{kind} — kind comes from the URL, operation comes + from the body, both are joined as the registry key.""" + + def __init__(self, endpoint_path, auth, dispatcher): + self.path = endpoint_path + self.auth = auth + self.dispatcher = dispatcher + + async def start(self): + pass + + def add_routes(self, app): + app.add_routes([web.post(self.path, self.handle)]) + + async def handle(self, request): + kind = request.match_info.get("kind", "") + if not kind: + return web.json_response( + {"error": "missing kind"}, status=404, + ) + + try: + body = await request.json() + except Exception: + return web.json_response( + {"error": "invalid json"}, status=400, + ) + if not isinstance(body, dict): + return web.json_response( + {"error": "body must be an object"}, status=400, + ) + + op_name = body.get("operation", "") + if not op_name: + return web.json_response( + {"error": "missing operation"}, status=400, + ) + + registry_key = f"{kind}:{op_name}" + op = lookup(registry_key) + if op is None: + return web.json_response( + {"error": "unknown operation"}, status=400, + ) + + identity = None + if op.capability != PUBLIC: + identity = await self.auth.authenticate(request) + + if op.capability not in (PUBLIC, AUTHENTICATED): + ctx = RequestContext( + body=body, + match_info=dict(request.match_info), + identity=identity, + ) + try: + resource = op.extract_resource(ctx) + parameters = op.extract_parameters(ctx) + except Exception as e: + logger.warning( + f"extractor failed for {registry_key!r}: " + f"{type(e).__name__}: {e}" + ) + return web.json_response( + {"error": "bad request"}, status=400, + ) + + await self.auth.authorise( + identity, op.capability, resource, parameters, + ) + + # Default-fill workspace into the body so downstream + # dispatchers see the canonical resolved value. 
The + # extractor has already pulled the workspace out; + # mirror it back to the body for the verbatim forward. + if "workspace" in resource: + body["workspace"] = resource["workspace"] + + async def responder(x, fin): + pass + + try: + resp = await self.dispatcher.process( + body, responder, request.match_info, + ) + except web.HTTPException: + raise + except Exception as e: + logger.error(f"Exception: {e}", exc_info=True) + return web.json_response({"error": str(e)}) + + return web.json_response(resp) diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/socket.py b/trustgraph-flow/trustgraph/gateway/endpoint/socket.py index 9065761c..f53ad73b 100644 --- a/trustgraph-flow/trustgraph/gateway/endpoint/socket.py +++ b/trustgraph-flow/trustgraph/gateway/endpoint/socket.py @@ -4,6 +4,9 @@ from aiohttp import web, WSMsgType import logging from .. running import Running +from .. capabilities import ( + PUBLIC, AUTHENTICATED, auth_failure, +) logger = logging.getLogger("socket") logger.setLevel(logging.INFO) @@ -11,12 +14,25 @@ logger.setLevel(logging.INFO) class SocketEndpoint: def __init__( - self, endpoint_path, auth, dispatcher, + self, endpoint_path, auth, dispatcher, capability, + in_band_auth=False, ): + """ + ``in_band_auth=True`` skips the handshake-time auth check. + The WebSocket handshake always succeeds; the dispatcher is + expected to gate itself via the first-frame auth protocol + (see ``Mux``). + + This avoids the browser problem where a 401 on the handshake + is treated as permanent and prevents reconnection, and lets + long-lived sockets refresh their credential mid-session by + sending a new auth frame. 
+ """ self.path = endpoint_path self.auth = auth - self.operation = "socket" + self.capability = capability + self.in_band_auth = in_band_auth self.dispatcher = dispatcher @@ -61,15 +77,33 @@ class SocketEndpoint: raise async def handle(self, request): - """Enhanced handler with better cleanup""" - try: - token = request.query['token'] - except: - token = "" + """Enhanced handler with better cleanup. + + Auth: WebSocket clients pass the bearer token on the + ``?token=...`` query string; we wrap it into a synthetic + Authorization header before delegating to the standard auth + path so the IAM-backed flow (JWT / API key) applies uniformly. + The first-frame auth protocol described in the IAM spec is + a future upgrade.""" + + if not self.in_band_auth and self.capability != PUBLIC: + token = request.query.get("token", "") + if not token: + return auth_failure() + try: + identity = await self.auth.authenticate( + _QueryTokenRequest(token) + ) + except web.HTTPException as e: + return e + if self.capability != AUTHENTICATED: + try: + await self.auth.authorise( + identity, self.capability, {}, {}, + ) + except web.HTTPException as e: + return e - if not self.auth.permitted(token, self.operation): - return web.HTTPUnauthorized() - # 50MB max message size ws = web.WebSocketResponse(max_msg_size=52428800) @@ -150,3 +184,11 @@ class SocketEndpoint: web.get(self.path, self.handle), ]) + +class _QueryTokenRequest: + """Minimal shim that exposes headers["Authorization"] to + IamAuth.authenticate(), derived from a query-string token.""" + + def __init__(self, token): + self.headers = {"Authorization": f"Bearer {token}"} + diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/stream_endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint/stream_endpoint.py index 38d8846f..7b0c4692 100644 --- a/trustgraph-flow/trustgraph/gateway/endpoint/stream_endpoint.py +++ b/trustgraph-flow/trustgraph/gateway/endpoint/stream_endpoint.py @@ -1,82 +1,64 @@ -import asyncio -from aiohttp 
import web import logging +from aiohttp import web + +from .. capabilities import enforce + logger = logging.getLogger("endpoint") logger.setLevel(logging.INFO) + class StreamEndpoint: - def __init__(self, endpoint_path, auth, dispatcher, method="POST"): - + def __init__( + self, endpoint_path, auth, dispatcher, capability, method="POST", + ): self.path = endpoint_path - self.auth = auth - self.operation = "service" + self.capability = capability self.method = method - self.dispatcher = dispatcher async def start(self): pass def add_routes(self, app): - if self.method == "POST": - app.add_routes([ - web.post(self.path, self.handle), - ]) + app.add_routes([web.post(self.path, self.handle)]) elif self.method == "GET": - app.add_routes([ - web.get(self.path, self.handle), - ]) + app.add_routes([web.get(self.path, self.handle)]) else: - raise RuntimeError("Bad method" + self.method) + raise RuntimeError("Bad method " + self.method) async def handle(self, request): logger.debug(f"Processing request: {request.path}") - try: - ht = request.headers["Authorization"] - tokens = ht.split(" ", 2) - if tokens[0] != "Bearer": - return web.HTTPUnauthorized() - token = tokens[1] - except: - token = "" - - if not self.auth.permitted(token, self.operation): - return web.HTTPUnauthorized() + await enforce(request, self.auth, self.capability) try: - data = request.content async def error(err): - return web.HTTPInternalServerError(text = err) + return web.HTTPInternalServerError(text=err) async def ok( - status=200, reason="OK", type="application/octet-stream" + status=200, reason="OK", + type="application/octet-stream", ): response = web.StreamResponse( - status = status, reason = reason, - headers = {"Content-Type": type} + status=status, reason=reason, + headers={"Content-Type": type}, ) await response.prepare(request) return response - resp = await self.dispatcher.process( - data, error, ok, request - ) - + resp = await self.dispatcher.process(data, error, ok, request) return resp 
+ except web.HTTPException: + raise except Exception as e: - logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - + logger.error(f"Exception: {e}", exc_info=True) + return web.json_response({"error": str(e)}) diff --git a/trustgraph-flow/trustgraph/gateway/endpoint/variable_endpoint.py b/trustgraph-flow/trustgraph/gateway/endpoint/variable_endpoint.py index 608de71b..6a336f42 100644 --- a/trustgraph-flow/trustgraph/gateway/endpoint/variable_endpoint.py +++ b/trustgraph-flow/trustgraph/gateway/endpoint/variable_endpoint.py @@ -1,27 +1,27 @@ -import asyncio -from aiohttp import web import logging +from aiohttp import web + +from .. capabilities import enforce, enforce_workspace + logger = logging.getLogger("endpoint") logger.setLevel(logging.INFO) + class VariableEndpoint: - def __init__(self, endpoint_path, auth, dispatcher): + def __init__(self, endpoint_path, auth, dispatcher, capability): self.path = endpoint_path - self.auth = auth - self.operation = "service" - + self.capability = capability self.dispatcher = dispatcher async def start(self): pass def add_routes(self, app): - app.add_routes([ web.post(self.path, self.handle), ]) @@ -30,35 +30,25 @@ class VariableEndpoint: logger.debug(f"Processing request: {request.path}") - try: - ht = request.headers["Authorization"] - tokens = ht.split(" ", 2) - if tokens[0] != "Bearer": - return web.HTTPUnauthorized() - token = tokens[1] - except: - token = "" - - if not self.auth.permitted(token, self.operation): - return web.HTTPUnauthorized() + identity = await enforce(request, self.auth, self.capability) try: - data = await request.json() + if identity is not None: + await enforce_workspace(data, identity, self.auth) + async def responder(x, fin): pass resp = await self.dispatcher.process( - data, responder, request.match_info + data, responder, request.match_info, ) return web.json_response(resp) + except web.HTTPException: + raise except Exception as e: - 
logging.error(f"Exception: {e}") - - return web.json_response( - { "error": str(e) } - ) - + logger.error(f"Exception: {e}", exc_info=True) + return web.json_response({"error": str(e)}) diff --git a/trustgraph-flow/trustgraph/gateway/registry.py b/trustgraph-flow/trustgraph/gateway/registry.py new file mode 100644 index 00000000..5e3344f4 --- /dev/null +++ b/trustgraph-flow/trustgraph/gateway/registry.py @@ -0,0 +1,533 @@ +""" +Gateway operation registry. + +Single declarative table mapping each operation the gateway +recognises to: + +- The capability the IAM regime is asked to authorise against. +- The resource level (system / workspace / flow) — determines the + shape of the resource identifier handed to ``authorise``. +- Extractors that build the resource and parameters from the + request context. + +This is a gateway-internal concept. It is not part of the IAM +contract — the contract specifies what arguments ``authorise`` +receives; the registry is how the gateway populates them. + +See docs/tech-specs/iam-contract.md for the contract and +docs/tech-specs/iam.md for the request anatomy. +""" + +from dataclasses import dataclass, field +from typing import Any, Callable + + +# Sentinels for operations that don't go through capability-based +# authorisation. Mirror the values used in capabilities.py so the +# gateway endpoint layer can recognise them uniformly. +PUBLIC = "__public__" +AUTHENTICATED = "__authenticated__" + + +class ResourceLevel: + """Where the operation's resource lives. + + ``SYSTEM`` — operation acts on a deployment-level resource + (the user registry, the workspace registry, + the signing key). resource = {}. Workspace, + if relevant, is a parameter, not an address. + + ``WORKSPACE`` — operation acts on something within a workspace + (config, library, knowledge, collections, flow + lifecycle). resource = {workspace}. + + ``FLOW`` — operation acts on something within a flow + within a workspace (graph, agent, llm, etc.). 
+ resource = {workspace, flow}. + """ + SYSTEM = "system" + WORKSPACE = "workspace" + FLOW = "flow" + + +@dataclass +class RequestContext: + """The bundle of inputs the registry's extractors operate on. + Assembled by the gateway from the incoming request after + authentication.""" + + # Parsed JSON body (HTTP) or inner request payload (WebSocket). + body: dict = field(default_factory=dict) + + # URL path components (HTTP) or WebSocket envelope routing + # fields (id, service, workspace, flow). + match_info: dict = field(default_factory=dict) + + # Authenticated identity for default-fill-in. Always present + # by the time extractors run, except for PUBLIC operations + # where it is None. + identity: Any = None + + +@dataclass +class Operation: + """Declared operation the gateway can dispatch + authorise.""" + + # Canonical operation name (used for registry lookup, audit, + # debug logs). Mirrors the operation strings in the IAM + # service and other backends where applicable. + name: str + + # Capability required to invoke this operation. Either a + # string from the capability vocabulary in capabilities.md, or + # the PUBLIC / AUTHENTICATED sentinel for operations that + # don't go through capability-based authorisation. + capability: str + + # Where the operation's resource lives. Determines the + # shape of the resource argument passed to authorise. + resource_level: str + + # Build the resource identifier from the request context. + # Returns a dict with the appropriate components for the + # resource level: {} for SYSTEM, {workspace} for WORKSPACE, + # {workspace, flow} for FLOW. Default-fill-in of workspace + # from identity.workspace happens here when applicable. + extract_resource: Callable[[RequestContext], dict] + + # Build the parameters dict — decision-relevant fields the + # operation supplied that are not part of the resource + # address. E.g. workspace association on a system-level + # user-registry operation. 
+ extract_parameters: Callable[[RequestContext], dict] + + +# --------------------------------------------------------------------------- +# Registry storage. +# --------------------------------------------------------------------------- + + +_REGISTRY: dict[str, Operation] = {} + + +def register(op: Operation) -> None: + if op.name in _REGISTRY: + raise RuntimeError( + f"operation {op.name!r} already registered" + ) + _REGISTRY[op.name] = op + + +def lookup(name: str) -> Operation | None: + return _REGISTRY.get(name) + + +def all_operations() -> list[Operation]: + return list(_REGISTRY.values()) + + +# --------------------------------------------------------------------------- +# Common extractor helpers. +# --------------------------------------------------------------------------- + + +def _empty_resource(_ctx: RequestContext) -> dict: + """System-level resource: empty dict.""" + return {} + + +def _workspace_from_body(ctx: RequestContext) -> dict: + """Workspace-level resource sourced from the request body's + workspace field, defaulting to the caller's bound workspace.""" + ws = (ctx.body.get("workspace") if isinstance(ctx.body, dict) else "") + if not ws and ctx.identity is not None: + ws = ctx.identity.workspace + return {"workspace": ws} + + +def _flow_from_match_info(ctx: RequestContext) -> dict: + """Flow-level resource sourced from URL path components or WS + envelope fields. Both ``workspace`` and ``flow`` are required; + no default-fill-in (the address is the operation's identity).""" + return { + "workspace": ctx.match_info.get("workspace", ""), + "flow": ctx.match_info.get("flow", ""), + } + + +def _no_parameters(_ctx: RequestContext) -> dict: + return {} + + +def _body_as_parameters(ctx: RequestContext) -> dict: + """All body fields are parameters — used when the operation's + body is small and uniformly decision-relevant (e.g. 
user- + registry ops where the body's user.workspace is what the + regime checks against the admin's scope).""" + return dict(ctx.body) if isinstance(ctx.body, dict) else {} + + +def _workspace_param_only(ctx: RequestContext) -> dict: + """Parameters dict carrying only the workspace association. + Used by system-level operations (e.g. user-registry ops) where + the workspace isn't part of the resource address but is the + field the regime uses to scope the admin's authority. + + Pulls the workspace from the inner ``user`` / ``workspace_record`` + body field if present (create-user, create-workspace), then from + the top-level body, then from the caller's bound workspace.""" + body = ctx.body if isinstance(ctx.body, dict) else {} + inner_user = body.get("user") if isinstance(body.get("user"), dict) else {} + inner_ws = ( + body.get("workspace_record") + if isinstance(body.get("workspace_record"), dict) else {} + ) + ws = ( + inner_user.get("workspace") + or inner_ws.get("id") + or body.get("workspace") + ) + if not ws and ctx.identity is not None: + ws = ctx.identity.workspace + return {"workspace": ws or ""} + + +# --------------------------------------------------------------------------- +# Operation registrations. +# +# The gateway looks operations up by their canonical name (the same +# string the request body / WS envelope carries in its ``operation`` +# field where applicable). Auth-surface operations (login, bootstrap, +# change-password) are not listed here — they have their own routes +# in auth_endpoints.py and use PUBLIC / AUTHENTICATED sentinels +# directly. Pure gateway↔IAM internal operations (resolve-api-key, +# authorise, authorise-many, get-signing-key-public) are likewise +# excluded; they are never invoked over the public API. +# --------------------------------------------------------------------------- + + +# IAM management operations. All routed through /api/v1/iam, body +# carries ``operation`` plus operation-specific fields. 
+ +# User registry: SYSTEM-level resource (users are global, identified +# by handle). The admin's authority is scoped per workspace via the +# parameters {workspace} field — that's what the regime checks +# against the admin's role workspace_scope. +register(Operation( + name="create-user", + capability="users:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_workspace_param_only, +)) +register(Operation( + name="list-users", + capability="users:read", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_workspace_param_only, +)) +register(Operation( + name="get-user", + capability="users:read", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_workspace_param_only, +)) +register(Operation( + name="update-user", + capability="users:write", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_workspace_param_only, +)) +register(Operation( + name="disable-user", + capability="users:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_workspace_param_only, +)) +register(Operation( + name="enable-user", + capability="users:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_workspace_param_only, +)) +register(Operation( + name="delete-user", + capability="users:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_workspace_param_only, +)) +register(Operation( + name="reset-password", + capability="users:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_workspace_param_only, +)) + + +# API keys: SYSTEM-level resource — like users, a key record exists +# in the deployment-wide keys registry. 
The workspace the key +# authenticates to is a property of the record, not a containment; +# it appears as a parameter so the regime can scope the admin's +# authority to issue / list / revoke against it. +register(Operation( + name="create-api-key", + capability="keys:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_workspace_param_only, +)) +register(Operation( + name="list-api-keys", + capability="keys:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_workspace_param_only, +)) +register(Operation( + name="revoke-api-key", + capability="keys:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_workspace_param_only, +)) + + +# Workspace registry: SYSTEM-level resource (workspaces are the +# top-level addressable unit). No parameters — the workspace being +# acted on is identified by the body, not used as a scope cue. +register(Operation( + name="create-workspace", + capability="workspaces:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_no_parameters, +)) +register(Operation( + name="list-workspaces", + capability="workspaces:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_no_parameters, +)) +register(Operation( + name="get-workspace", + capability="workspaces:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_no_parameters, +)) +register(Operation( + name="update-workspace", + capability="workspaces:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_no_parameters, +)) +register(Operation( + name="disable-workspace", + capability="workspaces:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_no_parameters, +)) + + +# Signing key: 
SYSTEM-level operational op. +register(Operation( + name="rotate-signing-key", + capability="iam:admin", + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_no_parameters, +)) + + +# --------------------------------------------------------------------------- +# Auth-surface entries. +# +# Listed here so the registry is the one place the gateway looks for +# operation→capability mappings — including the sentinels for paths +# that don't go through capability-based authorisation. The actual +# routing is in auth_endpoints.py; these entries let the registry- +# driven dispatcher recognise the operation if it sees it on a +# generic path. +# --------------------------------------------------------------------------- + +register(Operation( + name="login", + capability=PUBLIC, + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_no_parameters, +)) +register(Operation( + name="bootstrap", + capability=PUBLIC, + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_no_parameters, +)) +register(Operation( + name="bootstrap-status", + capability=PUBLIC, + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_no_parameters, +)) +register(Operation( + name="change-password", + capability=AUTHENTICATED, + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_no_parameters, +)) +register(Operation( + name="whoami", + capability=AUTHENTICATED, + resource_level=ResourceLevel.SYSTEM, + extract_resource=_empty_resource, + extract_parameters=_no_parameters, +)) + + +# --------------------------------------------------------------------------- +# Generic kind/operation entries. +# +# Names are ``:`` so the registry key is unique +# across dispatchers. All entries below are workspace-level +# resources (workspace defaulted from the caller's bound workspace +# if absent). 
Read/write distinction maps to the existing +# ``:read`` / ``:write`` capability vocabulary +# defined in capabilities.md. +# --------------------------------------------------------------------------- + + +def _register_kind_op(kind: str, op: str, capability: str) -> None: + """Helper: register a workspace-level kind:op with the standard + extractors (workspace from body, no extra parameters).""" + register(Operation( + name=f"{kind}:{op}", + capability=capability, + resource_level=ResourceLevel.WORKSPACE, + extract_resource=_workspace_from_body, + extract_parameters=_no_parameters, + )) + + +# config: KV-style workspace config service. +for _op in ("get", "list", "getvalues", "getvalues-all-ws", "config"): + _register_kind_op("config", _op, "config:read") +for _op in ("put", "delete"): + _register_kind_op("config", _op, "config:write") + + +# flow: flow-blueprint and flow-lifecycle service. +for _op in ("list-blueprints", "get-blueprint", "list-flows", "get-flow"): + _register_kind_op("flow", _op, "flows:read") +for _op in ("put-blueprint", "delete-blueprint", "start-flow", "stop-flow"): + _register_kind_op("flow", _op, "flows:write") + + +# librarian: document storage and processing service. +for _op in ( + "get-document-metadata", "get-document-content", + "stream-document", "list-documents", "list-processing", + "get-upload-status", "list-uploads", +): + _register_kind_op("librarian", _op, "documents:read") +for _op in ( + "add-document", "remove-document", "update-document", + "add-processing", "remove-processing", + "begin-upload", "upload-chunk", "complete-upload", "abort-upload", +): + _register_kind_op("librarian", _op, "documents:write") + + +# knowledge: knowledge-graph core service. 
# knowledge-graph core ops: reads first, then mutating ops, so the
# registration order matches the original literal register() calls.
for _operation, _capability in (
    ("get-kg-core", "knowledge:read"),
    ("list-kg-cores", "knowledge:read"),
    ("put-kg-core", "knowledge:write"),
    ("delete-kg-core", "knowledge:write"),
    ("load-kg-core", "knowledge:write"),
    ("unload-kg-core", "knowledge:write"),
):
    _register_kind_op("knowledge", _operation, _capability)


# collection-management: workspace collection lifecycle.
for _operation, _capability in (
    ("list-collections", "collections:read"),
    ("update-collection", "collections:write"),
    ("delete-collection", "collections:write"),
):
    _register_kind_op("collection-management", _operation, _capability)


# ---------------------------------------------------------------------------
# Per-flow data-plane services.
#
# Covers /api/v1/flow/{flow}/service/{kind} plus the streaming
# /api/v1/flow/{flow}/{import,export}/{kind} paths. There is no
# body-level ``operation`` discriminator here — the kind in the URL
# is the operation identity. Resources are FLOW level, i.e. the
# (workspace, flow) pair taken from the request's match_info.
#
# Registry names take the form ``flow-service:<kind>``,
# ``flow-import:<kind>`` and ``flow-export:<kind>``.
# ---------------------------------------------------------------------------


def _register_flow_kind(prefix: str, kind: str, capability: str) -> None:
    """Register one FLOW-level operation named ``prefix:kind``.

    Uses the standard flow extractors: the resource is the
    (workspace, flow) pair taken from match_info, and no extra
    parameters are forwarded to the authoriser."""
    operation = Operation(
        name=f"{prefix}:{kind}",
        capability=capability,
        resource_level=ResourceLevel.FLOW,
        extract_resource=_flow_from_match_info,
        extract_parameters=_no_parameters,
    )
    register(operation)


# Request/response services on /api/v1/flow/{flow}/service/{kind}.
_FLOW_SERVICES = {
    "agent": "agent",
    "text-completion": "llm",
    "prompt": "llm",
    "mcp-tool": "mcp",
    "graph-rag": "graph:read",
    "document-rag": "documents:read",
    "embeddings": "embeddings",
    "graph-embeddings": "graph:read",
    "document-embeddings": "documents:read",
    "triples": "graph:read",
    "rows": "rows:read",
    "nlp-query": "rows:read",
    "structured-query": "rows:read",
    "structured-diag": "rows:read",
    "row-embeddings": "rows:read",
    "sparql": "graph:read",
}
for _service_kind, _service_cap in _FLOW_SERVICES.items():
    _register_flow_kind("flow-service", _service_kind, _service_cap)


# Streaming import socket endpoints.
+_FLOW_IMPORTS = { + "triples": "graph:write", + "graph-embeddings": "graph:write", + "document-embeddings": "documents:write", + "entity-contexts": "documents:write", + "rows": "rows:write", +} +for _kind, _cap in _FLOW_IMPORTS.items(): + _register_flow_kind("flow-import", _kind, _cap) + + +# Streaming export socket endpoints. +_FLOW_EXPORTS = { + "triples": "graph:read", + "graph-embeddings": "graph:read", + "document-embeddings": "documents:read", + "entity-contexts": "documents:read", +} +for _kind, _cap in _FLOW_EXPORTS.items(): + _register_flow_kind("flow-export", _kind, _cap) diff --git a/trustgraph-flow/trustgraph/gateway/service.py b/trustgraph-flow/trustgraph/gateway/service.py index 4e465bf7..f75f3b25 100755 --- a/trustgraph-flow/trustgraph/gateway/service.py +++ b/trustgraph-flow/trustgraph/gateway/service.py @@ -12,7 +12,7 @@ import os from trustgraph.base.logging import setup_logging, add_logging_args from trustgraph.base.pubsub import get_pubsub, add_pubsub_args -from . auth import Authenticator +from . auth import IamAuth from . config.receiver import ConfigReceiver from . dispatch.manager import DispatcherManager @@ -35,7 +35,6 @@ default_prometheus_url = os.getenv("PROMETHEUS_URL", "http://prometheus:9090") default_pulsar_api_key = os.getenv("PULSAR_API_KEY", None) default_timeout = 600 default_port = 8088 -default_api_token = os.getenv("GATEWAY_SECRET", "") class Api: @@ -60,13 +59,14 @@ class Api: if not self.prometheus_url.endswith("/"): self.prometheus_url += "/" - api_token = config.get("api_token", default_api_token) - - # Token not set, or token equal empty string means no auth - if api_token: - self.auth = Authenticator(token=api_token) - else: - self.auth = Authenticator(allow_all=True) + # IAM-backed authentication. The legacy GATEWAY_SECRET + # shared-token path has been removed — there is no + # "open for everyone" fallback. The gateway cannot + # authenticate any request until IAM is reachable. 
+ self.auth = IamAuth( + backend=self.pubsub_backend, + id=config.get("id", "api-gateway"), + ) self.config_receiver = ConfigReceiver(self.pubsub_backend) @@ -118,6 +118,7 @@ class Api: config_receiver = self.config_receiver, prefix = "gateway", queue_overrides = queue_overrides, + auth = self.auth, ) self.endpoint_manager = EndpointManager( @@ -132,12 +133,18 @@ class Api: ] async def app_factory(self): - + self.app = web.Application( middlewares=[], client_max_size=256 * 1024 * 1024 ) + # Fetch IAM signing public key before accepting traffic. + # Blocks for a bounded retry window; the gateway starts even + # if IAM is still unreachable (JWT validation will 401 until + # the key is available). + await self.auth.start() + await self.config_receiver.start() for ep in self.endpoints: @@ -189,12 +196,6 @@ def run(): help=f'API request timeout in seconds (default: {default_timeout})', ) - parser.add_argument( - '--api-token', - default=default_api_token, - help=f'Secret API token (default: no auth)', - ) - add_logging_args(parser) parser.add_argument( diff --git a/trustgraph-flow/trustgraph/iam/__init__.py b/trustgraph-flow/trustgraph/iam/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/trustgraph-flow/trustgraph/iam/service/__init__.py b/trustgraph-flow/trustgraph/iam/service/__init__.py new file mode 100644 index 00000000..98f4d9da --- /dev/null +++ b/trustgraph-flow/trustgraph/iam/service/__init__.py @@ -0,0 +1 @@ +from . service import * diff --git a/trustgraph-flow/trustgraph/iam/service/__main__.py b/trustgraph-flow/trustgraph/iam/service/__main__.py new file mode 100644 index 00000000..a731dd63 --- /dev/null +++ b/trustgraph-flow/trustgraph/iam/service/__main__.py @@ -0,0 +1,4 @@ + +from . 
service import run + +run() diff --git a/trustgraph-flow/trustgraph/iam/service/iam.py b/trustgraph-flow/trustgraph/iam/service/iam.py new file mode 100644 index 00000000..c89f65b0 --- /dev/null +++ b/trustgraph-flow/trustgraph/iam/service/iam.py @@ -0,0 +1,1358 @@ +""" +IAM business logic. Handles ``IamRequest`` messages and builds +``IamResponse`` messages. Does not concern itself with transport. + +See docs/tech-specs/iam-protocol.md for the wire-level contract and +docs/tech-specs/iam.md for the surrounding architecture. +""" + +import asyncio +import base64 +import datetime +import hashlib +import json +import logging +import os +import secrets +import uuid + +from cryptography.hazmat.primitives import serialization +from cryptography.hazmat.primitives.asymmetric import ed25519 + +from trustgraph.schema import ( + IamResponse, Error, + UserRecord, WorkspaceRecord, ApiKeyRecord, +) + +from ... tables.iam import IamTableStore + +logger = logging.getLogger(__name__) + + +DEFAULT_WORKSPACE = "default" +BOOTSTRAP_ADMIN_USERNAME = "admin" +BOOTSTRAP_ADMIN_NAME = "Administrator" + +PBKDF2_ITERATIONS = 600_000 +API_KEY_PREFIX = "tg_" +API_KEY_RANDOM_BYTES = 24 + +JWT_ISSUER = "trustgraph-iam" +JWT_TTL_SECONDS = 3600 + +# Default authorisation cache TTL the regime tells the gateway to +# observe. 60s is the OSS-spec maximum revocation latency: a role +# change, workspace disable, or key revoke takes effect within at +# most this much time. +AUTHZ_CACHE_TTL_SECONDS = 60 + + +# OSS regime role table. Lives here, not in the gateway — the +# gateway is regime-agnostic and must not encode policy. +# +# Each role has a capability set and a workspace scope. The +# evaluator (handle_authorise below) checks (a) that some role +# held by the caller grants the requested capability, and (b) +# that role's workspace scope permits the target workspace. 
+ +_READER_CAPS = { + "agent", + "graph:read", + "documents:read", + "rows:read", + "llm", + "embeddings", + "mcp", + "config:read", + "flows:read", + "collections:read", + "knowledge:read", + "keys:self", +} + +_WRITER_CAPS = _READER_CAPS | { + "graph:write", + "documents:write", + "rows:write", + "collections:write", + "knowledge:write", +} + +_ADMIN_CAPS = _WRITER_CAPS | { + "config:write", + "flows:write", + "users:read", "users:write", "users:admin", + "keys:admin", + "workspaces:admin", + "iam:admin", + "metrics:read", +} + +ROLE_DEFINITIONS = { + "reader": { + "capabilities": _READER_CAPS, + "workspace_scope": "assigned", + }, + "writer": { + "capabilities": _WRITER_CAPS, + "workspace_scope": "assigned", + }, + "admin": { + "capabilities": _ADMIN_CAPS, + "workspace_scope": "*", + }, +} + + +def _scope_permits(role_scope, target_workspace, assigned_workspace): + """Does the given role apply to ``target_workspace``?""" + if role_scope == "*": + return True + if role_scope == "assigned": + return target_workspace == assigned_workspace + return False + + +def _now_iso(): + return datetime.datetime.now(datetime.timezone.utc).isoformat() + + +def _now_dt(): + return datetime.datetime.now(datetime.timezone.utc) + + +def _iso(dt): + if dt is None: + return "" + if isinstance(dt, str): + return dt + if dt.tzinfo is None: + dt = dt.replace(tzinfo=datetime.timezone.utc) + return dt.isoformat() + + +def _hash_password(password): + """Return an encoded PBKDF2-SHA-256 hash of ``password``. + + Format: ``pbkdf2-sha256$$$``. Stored + verbatim in the password_hash column so the algorithm and cost + can be evolved later (new rows get a new prefix; old rows are + verified with their own parameters). 
+ """ + salt = os.urandom(16) + dk = hashlib.pbkdf2_hmac( + "sha256", password.encode("utf-8"), salt, PBKDF2_ITERATIONS, + ) + return ( + f"pbkdf2-sha256${PBKDF2_ITERATIONS}" + f"${base64.b64encode(salt).decode('ascii')}" + f"${base64.b64encode(dk).decode('ascii')}" + ) + + +def _verify_password(password, encoded): + """Constant-time verify ``password`` against an encoded hash.""" + try: + algo, iters, b64_salt, b64_hash = encoded.split("$") + except ValueError: + return False + if algo != "pbkdf2-sha256": + return False + try: + iters = int(iters) + salt = base64.b64decode(b64_salt) + target = base64.b64decode(b64_hash) + except Exception: + return False + dk = hashlib.pbkdf2_hmac( + "sha256", password.encode("utf-8"), salt, iters, + ) + return secrets.compare_digest(dk, target) + + +def _generate_api_key(): + """Return a fresh API-key plaintext of the form ``tg_``.""" + return API_KEY_PREFIX + secrets.token_urlsafe(API_KEY_RANDOM_BYTES) + + +def _hash_api_key(plaintext): + """SHA-256 hex digest of an API key plaintext. Used as the + primary key in ``iam_api_keys`` so ``resolve-api-key`` is O(1).""" + return hashlib.sha256(plaintext.encode("utf-8")).hexdigest() + + +def _err(type, message): + return IamResponse(error=Error(type=type, message=message)) + + +def _parse_expires(s): + if not s: + return None + try: + return datetime.datetime.fromisoformat(s) + except Exception: + return None + + +def _b64url(data): + """URL-safe base64 encode without padding, as required by JWT.""" + return base64.urlsafe_b64encode(data).rstrip(b"=").decode("ascii") + + +def _generate_signing_keypair(): + """Return (kid, private_pem, public_pem) for a fresh Ed25519 + keypair. 
Ed25519 / EdDSA: small (32-byte public key), fast, + deterministic, side-channel-resistant by construction, free of + NIST-curve baggage.""" + key = ed25519.Ed25519PrivateKey.generate() + private_pem = key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.PKCS8, + encryption_algorithm=serialization.NoEncryption(), + ).decode("ascii") + public_pem = key.public_key().public_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PublicFormat.SubjectPublicKeyInfo, + ).decode("ascii") + kid = uuid.uuid4().hex[:16] + return kid, private_pem, public_pem + + +def _sign_jwt(kid, private_pem, claims): + """Produce a compact-serialisation EdDSA (Ed25519) JWT for + ``claims``.""" + key = serialization.load_pem_private_key( + private_pem.encode("ascii"), password=None, + ) + if not isinstance(key, ed25519.Ed25519PrivateKey): + raise RuntimeError( + f"signing key is not Ed25519: {type(key).__name__}" + ) + + header = {"alg": "EdDSA", "typ": "JWT", "kid": kid} + header_b = _b64url(json.dumps( + header, separators=(",", ":"), sort_keys=True, + ).encode("utf-8")) + payload_b = _b64url(json.dumps( + claims, separators=(",", ":"), sort_keys=True, + ).encode("utf-8")) + signing_input = f"{header_b}.{payload_b}".encode("ascii") + signature = key.sign(signing_input) + + return f"{header_b}.{payload_b}.{_b64url(signature)}" + + +class IamService: + + def __init__(self, host, username, password, keyspace, + bootstrap_mode, bootstrap_token=None): + self.table_store = IamTableStore( + host, username, password, keyspace, + ) + # bootstrap_mode: "token" or "bootstrap". In "token" mode the + # service auto-seeds on first start using the provided + # bootstrap_token and the ``bootstrap`` operation is refused + # thereafter (indistinguishable from an already-bootstrapped + # deployment per the error policy). In "bootstrap" mode the + # ``bootstrap`` operation is live until tables are populated. 
+ if bootstrap_mode not in ("token", "bootstrap"): + raise ValueError( + f"bootstrap_mode must be 'token' or 'bootstrap', " + f"got {bootstrap_mode!r}" + ) + if bootstrap_mode == "token" and not bootstrap_token: + raise ValueError( + "bootstrap_mode='token' requires bootstrap_token" + ) + self.bootstrap_mode = bootstrap_mode + self.bootstrap_token = bootstrap_token + + self._signing_key = None + self._signing_key_lock = asyncio.Lock() + + # ------------------------------------------------------------------ + # Dispatch + # ------------------------------------------------------------------ + + async def handle(self, v): + op = v.operation + + try: + if op == "bootstrap": + return await self.handle_bootstrap(v) + if op == "bootstrap-status": + return await self.handle_bootstrap_status(v) + if op == "whoami": + return await self.handle_whoami(v) + if op == "resolve-api-key": + return await self.handle_resolve_api_key(v) + if op == "create-user": + return await self.handle_create_user(v) + if op == "list-users": + return await self.handle_list_users(v) + if op == "create-api-key": + return await self.handle_create_api_key(v) + if op == "list-api-keys": + return await self.handle_list_api_keys(v) + if op == "revoke-api-key": + return await self.handle_revoke_api_key(v) + if op == "login": + return await self.handle_login(v) + if op == "get-signing-key-public": + return await self.handle_get_signing_key_public(v) + if op == "change-password": + return await self.handle_change_password(v) + if op == "reset-password": + return await self.handle_reset_password(v) + if op == "get-user": + return await self.handle_get_user(v) + if op == "update-user": + return await self.handle_update_user(v) + if op == "disable-user": + return await self.handle_disable_user(v) + if op == "enable-user": + return await self.handle_enable_user(v) + if op == "delete-user": + return await self.handle_delete_user(v) + if op == "create-workspace": + return await self.handle_create_workspace(v) + if 
op == "list-workspaces": + return await self.handle_list_workspaces(v) + if op == "get-workspace": + return await self.handle_get_workspace(v) + if op == "update-workspace": + return await self.handle_update_workspace(v) + if op == "disable-workspace": + return await self.handle_disable_workspace(v) + if op == "rotate-signing-key": + return await self.handle_rotate_signing_key(v) + if op == "authorise": + return await self.handle_authorise(v) + if op == "authorise-many": + return await self.handle_authorise_many(v) + + return _err( + "invalid-argument", + f"unknown or not-yet-implemented operation: {op!r}", + ) + + except Exception as e: + logger.error( + f"IAM {op} failed: {type(e).__name__}: {e}", + exc_info=True, + ) + return _err("internal-error", str(e)) + + # ------------------------------------------------------------------ + # Record conversion + # ------------------------------------------------------------------ + + def _row_to_user_record(self, row): + ( + id, workspace, username, name, email, _password_hash, + roles, enabled, must_change_password, created, + ) = row + return UserRecord( + id=id or "", + workspace=workspace or "", + username=username or "", + name=name or "", + email=email or "", + roles=sorted(roles) if roles else [], + enabled=bool(enabled), + must_change_password=bool(must_change_password), + created=_iso(created), + ) + + def _row_to_api_key_record(self, row): + ( + _key_hash, id, user_id, name, prefix, expires, + created, last_used, + ) = row + return ApiKeyRecord( + id=id or "", + user_id=user_id or "", + name=name or "", + prefix=prefix or "", + expires=_iso(expires), + created=_iso(created), + last_used=_iso(last_used), + ) + + # ------------------------------------------------------------------ + # bootstrap + # ------------------------------------------------------------------ + + async def auto_bootstrap_if_token_mode(self): + """Called from the service processor at startup. 
In + ``token`` mode, if tables are empty, seeds the default + workspace / admin / signing key using the operator-provided + bootstrap token. The admin's API key plaintext is *the* + ``bootstrap_token`` — the operator already knows it, nothing + needs to be returned or logged. + + In ``bootstrap`` mode this is a no-op; seeding happens on + explicit ``bootstrap`` operation invocation.""" + if self.bootstrap_mode != "token": + return + + if await self.table_store.any_workspace_exists(): + logger.info( + "IAM: token mode, tables already populated; skipping " + "auto-bootstrap" + ) + return + + logger.info("IAM: token mode, empty tables; auto-bootstrapping") + await self._seed_tables(self.bootstrap_token) + logger.info( + "IAM: auto-bootstrap complete using operator-provided token" + ) + + async def _seed_tables(self, api_key_plaintext): + """Shared seeding logic used by token-mode auto-bootstrap and + bootstrap-mode handle_bootstrap. Creates the default + workspace, admin user, admin API key (using the given + plaintext), and an initial signing key. 
Returns the admin + user id.""" + now = _now_dt() + + await self.table_store.put_workspace( + id=DEFAULT_WORKSPACE, + name="Default", + enabled=True, + created=now, + ) + + admin_user_id = str(uuid.uuid4()) + admin_password = secrets.token_urlsafe(32) + await self.table_store.put_user( + id=admin_user_id, + workspace=DEFAULT_WORKSPACE, + username=BOOTSTRAP_ADMIN_USERNAME, + name=BOOTSTRAP_ADMIN_NAME, + email="", + password_hash=_hash_password(admin_password), + roles=["admin"], + enabled=True, + must_change_password=True, + created=now, + ) + + key_id = str(uuid.uuid4()) + await self.table_store.put_api_key( + key_hash=_hash_api_key(api_key_plaintext), + id=key_id, + user_id=admin_user_id, + name="bootstrap", + prefix=api_key_plaintext[:len(API_KEY_PREFIX) + 4], + expires=None, + created=now, + last_used=None, + ) + + kid, private_pem, public_pem = _generate_signing_keypair() + await self.table_store.put_signing_key( + kid=kid, + private_pem=private_pem, + public_pem=public_pem, + created=now, + retired=None, + ) + self._signing_key = (kid, private_pem, public_pem) + + logger.info( + f"IAM seeded: workspace={DEFAULT_WORKSPACE!r}, " + f"admin user_id={admin_user_id}, signing key kid={kid}" + ) + return admin_user_id + + async def handle_bootstrap(self, v): + """Explicit bootstrap op. Only available in ``bootstrap`` + mode and only when tables are empty. 
Every other case is + masked to a generic auth failure — the caller cannot + distinguish 'not in bootstrap mode' from 'already + bootstrapped' from 'operation forbidden'.""" + if self.bootstrap_mode != "bootstrap": + return _err("auth-failed", "auth failure") + + if await self.table_store.any_workspace_exists(): + return _err("auth-failed", "auth failure") + + plaintext = _generate_api_key() + admin_user_id = await self._seed_tables(plaintext) + + return IamResponse( + bootstrap_admin_user_id=admin_user_id, + bootstrap_admin_api_key=plaintext, + ) + + async def handle_whoami(self, v): + """Return the caller's own user record. ``v.actor`` is the + authenticated identity's handle (the gateway populates it + from ``identity.handle``). No ``users:read`` capability + required — every authenticated user can read themselves.""" + if not v.actor: + return _err( + "invalid-argument", + "actor required (gateway should populate this)", + ) + user_row = await self.table_store.get_user(v.actor) + if user_row is None: + return _err("not-found", "user not found") + return IamResponse(user=self._row_to_user_record(user_row)) + + async def handle_bootstrap_status(self, v): + """Probe op: returns whether the deployment is currently in + the unconsumed-bootstrap state (i.e. ``bootstrap`` mode with + empty tables, where an explicit ``bootstrap`` call would + succeed). PUBLIC so a UI can decide whether to render the + first-run setup flow without invoking the side-effectful + ``bootstrap`` op. 
+ + The information leaked is intentionally narrow: an empty + deployment in bootstrap mode is already inferable (no users, + no logins succeed); this just makes the answer explicit + instead of forcing callers to probe the masked-failure path.""" + available = ( + self.bootstrap_mode == "bootstrap" + and not await self.table_store.any_workspace_exists() + ) + return IamResponse(bootstrap_available=available) + + # ------------------------------------------------------------------ + # Signing key helpers + # ------------------------------------------------------------------ + + async def _get_active_signing_key(self): + """Return ``(kid, private_pem, public_pem)`` for the active + signing key. Loads from Cassandra on first call. Generates + and persists a new key if none exists — covers the case where + ``login`` is called before ``bootstrap`` (shouldn't happen in + practice but keeps the service internally consistent).""" + if self._signing_key is not None: + return self._signing_key + + async with self._signing_key_lock: + if self._signing_key is not None: + return self._signing_key + + rows = await self.table_store.list_signing_keys() + active = [r for r in rows if r[4] is None] + + if active: + row = active[0] + self._signing_key = (row[0], row[1], row[2]) + logger.info( + f"IAM: loaded active signing key kid={row[0]}" + ) + return self._signing_key + + kid, private_pem, public_pem = _generate_signing_keypair() + await self.table_store.put_signing_key( + kid=kid, + private_pem=private_pem, + public_pem=public_pem, + created=_now_dt(), + retired=None, + ) + self._signing_key = (kid, private_pem, public_pem) + logger.info( + f"IAM: generated active signing key kid={kid} " + f"(no existing key found)" + ) + return self._signing_key + + # ------------------------------------------------------------------ + # login + # ------------------------------------------------------------------ + + async def handle_login(self, v): + if not v.username: + return 
_err("auth-failed", "username required") + if not v.password: + return _err("auth-failed", "password required") + + # Login accepts an optional workspace parameter. If omitted + # we use the default workspace (OSS single-workspace + # assumption). Multi-workspace enterprise editions swap in a + # resolver that looks across the caller's permitted set. + workspace = v.workspace or DEFAULT_WORKSPACE + + user_id = await self.table_store.get_user_id_by_username( + workspace, v.username, + ) + if not user_id: + return _err("auth-failed", "no such user") + + user_row = await self.table_store.get_user(user_id) + if user_row is None: + return _err("auth-failed", "user disappeared") + + ( + id, ws, _username, _name, _email, password_hash, + _roles, enabled, _mcp, _created, + ) = user_row + + if not enabled: + return _err("auth-failed", "user disabled") + if not password_hash or not _verify_password( + v.password, password_hash, + ): + return _err("auth-failed", "bad credentials") + + ws_row = await self.table_store.get_workspace(ws) + if ws_row is None or not ws_row[2]: + return _err("auth-failed", "workspace disabled") + + kid, private_pem, _ = await self._get_active_signing_key() + + now_ts = int(_now_dt().timestamp()) + exp_ts = now_ts + JWT_TTL_SECONDS + # Per the IAM contract the gateway never reads policy state + # from the credential — roles stay server-side, reachable + # only via authorise(). JWT carries identity + workspace + # binding only. 
+ claims = { + "iss": JWT_ISSUER, + "sub": id, + "workspace": ws, + "iat": now_ts, + "exp": exp_ts, + } + token = _sign_jwt(kid, private_pem, claims) + + expires_iso = datetime.datetime.fromtimestamp( + exp_ts, tz=datetime.timezone.utc, + ).isoformat() + + return IamResponse(jwt=token, jwt_expires=expires_iso) + + # ------------------------------------------------------------------ + # get-signing-key-public + # ------------------------------------------------------------------ + + async def handle_get_signing_key_public(self, v): + _, _, public_pem = await self._get_active_signing_key() + return IamResponse(signing_key_public=public_pem) + + # ------------------------------------------------------------------ + # Record-conversion helper for workspaces + # ------------------------------------------------------------------ + + def _row_to_workspace_record(self, row): + id, name, enabled, created = row + return WorkspaceRecord( + id=id or "", + name=name or "", + enabled=bool(enabled), + created=_iso(created), + ) + + async def _resolve_user(self, user_id, workspace=None): + """Return (user_row, error_response_or_None). Loads the user + record by id and (when ``workspace`` is supplied) verifies the + record's home workspace matches. + + Workspace is an *optional integrity check* — the user record + is system-level, identified by id alone. If the caller asserts + a workspace, we verify; if they omit it, we just return the + record. Authorisation (whether the caller is permitted to + operate on this user) is the gateway's responsibility via the + contract's ``authorise`` call before the handler is reached. 
+ """ + user_row = await self.table_store.get_user(user_id) + if user_row is None: + return None, _err("not-found", "user not found") + if workspace and user_row[1] != workspace: + return None, _err( + "operation-not-permitted", + "user is in a different workspace", + ) + return user_row, None + + # ------------------------------------------------------------------ + # change-password + # ------------------------------------------------------------------ + + async def handle_change_password(self, v): + if not v.user_id: + return _err("invalid-argument", "user_id required") + if not v.password: + return _err("invalid-argument", "password (current) required") + if not v.new_password: + return _err("invalid-argument", "new_password required") + + user_row = await self.table_store.get_user(v.user_id) + if user_row is None: + return _err("auth-failed", "no such user") + + _id, _ws, _un, _name, _email, password_hash, _r, enabled, _mcp, _c = ( + user_row + ) + if not enabled: + return _err("auth-failed", "user disabled") + if not password_hash or not _verify_password( + v.password, password_hash, + ): + return _err("auth-failed", "bad credentials") + + await self.table_store.update_user_password( + id=v.user_id, + password_hash=_hash_password(v.new_password), + must_change_password=False, + ) + return IamResponse() + + # ------------------------------------------------------------------ + # reset-password + # ------------------------------------------------------------------ + + async def handle_reset_password(self, v): + if not v.user_id: + return _err("invalid-argument", "user_id required") + + _, err = await self._resolve_user(v.user_id, v.workspace or None) + if err is not None: + return err + + temporary = secrets.token_urlsafe(12) + await self.table_store.update_user_password( + id=v.user_id, + password_hash=_hash_password(temporary), + must_change_password=True, + ) + return IamResponse(temporary_password=temporary) + + # 
------------------------------------------------------------------ + # get-user / update-user / disable-user + # ------------------------------------------------------------------ + + async def handle_get_user(self, v): + if not v.user_id: + return _err("invalid-argument", "user_id required") + + user_row, err = await self._resolve_user( + v.user_id, v.workspace or None, + ) + if err is not None: + return err + return IamResponse(user=self._row_to_user_record(user_row)) + + async def handle_update_user(self, v): + """Update user profile fields: name, email, roles, enabled, + must_change_password. Username is immutable — change it by + creating a new user and disabling the old one. Password + changes go through change-password / reset-password.""" + if not v.user_id: + return _err("invalid-argument", "user_id required") + if v.user is None: + return _err("invalid-argument", "user field required") + if v.user.password: + return _err( + "invalid-argument", + "password cannot be changed via update-user; " + "use change-password or reset-password", + ) + + existing, err = await self._resolve_user( + v.user_id, v.workspace or None, + ) + if err is not None: + return err + if v.user.username and v.user.username != existing[2]: + return _err( + "invalid-argument", + "username is immutable; create a new user instead", + ) + + # Carry forward fields the caller didn't provide. 
+ ( + _id, _ws, _username, cur_name, cur_email, _pw, + cur_roles, cur_enabled, cur_mcp, _created, + ) = existing + + new_name = v.user.name if v.user.name else cur_name + new_email = v.user.email if v.user.email else cur_email + new_roles = list(v.user.roles) if v.user.roles else list( + cur_roles or [], + ) + new_enabled = v.user.enabled if v.user.enabled is not None else ( + cur_enabled + ) + new_mcp = ( + v.user.must_change_password + if v.user.must_change_password is not None + else cur_mcp + ) + + await self.table_store.update_user_profile( + id=v.user_id, + name=new_name, + email=new_email, + roles=new_roles, + enabled=new_enabled, + must_change_password=new_mcp, + ) + + updated = await self.table_store.get_user(v.user_id) + return IamResponse(user=self._row_to_user_record(updated)) + + async def handle_disable_user(self, v): + """Soft-delete: set enabled=false and revoke every API key + belonging to the user.""" + if not v.user_id: + return _err("invalid-argument", "user_id required") + + _, err = await self._resolve_user(v.user_id, v.workspace or None) + if err is not None: + return err + + await self.table_store.update_user_enabled( + id=v.user_id, enabled=False, + ) + + # Revoke all their API keys. + key_rows = await self.table_store.list_api_keys_by_user(v.user_id) + for kr in key_rows: + await self.table_store.delete_api_key(kr[0]) + + return IamResponse() + + async def handle_enable_user(self, v): + """Re-enable a previously disabled user. Does not restore + API keys — those have to be re-issued by the admin.""" + if not v.user_id: + return _err("invalid-argument", "user_id required") + + _, err = await self._resolve_user(v.user_id, v.workspace or None) + if err is not None: + return err + + await self.table_store.update_user_enabled( + id=v.user_id, enabled=True, + ) + return IamResponse() + + async def handle_delete_user(self, v): + """Hard-delete a user. 
Removes the ``iam_users`` row, the + ``iam_users_by_username`` lookup row, and every API key + belonging to the user. + + Unlike disable, this frees the username for re-use and + removes the user's personal data from storage (intended to + cover GDPR erasure-style requirements). When audit logging + lands, the decision to delete vs. anonymise referenced audit + rows will need to be revisited.""" + if not v.user_id: + return _err("invalid-argument", "user_id required") + + user_row, err = await self._resolve_user( + v.user_id, v.workspace or None, + ) + if err is not None: + return err + + # user_row indices match get_user columns. Username is [2]. + username = user_row[2] + record_workspace = user_row[1] + + # Revoke all API keys. + key_rows = await self.table_store.list_api_keys_by_user(v.user_id) + for kr in key_rows: + await self.table_store.delete_api_key(kr[0]) + + # Remove username lookup — keyed on (workspace, username), + # so use the resolved workspace from the user record rather + # than relying on the caller-supplied filter. + if username: + await self.table_store.delete_username_lookup( + record_workspace, username, + ) + + # Remove user record. 
+ await self.table_store.delete_user(v.user_id) + + return IamResponse() + + # ------------------------------------------------------------------ + # Workspace CRUD + # ------------------------------------------------------------------ + + async def handle_create_workspace(self, v): + if v.workspace_record is None or not v.workspace_record.id: + return _err( + "invalid-argument", + "workspace_record.id required for create-workspace", + ) + if v.workspace_record.id.startswith("_"): + return _err( + "invalid-argument", + "workspace ids beginning with '_' are reserved", + ) + + existing = await self.table_store.get_workspace( + v.workspace_record.id, + ) + if existing is not None: + return _err("duplicate", "workspace already exists") + + now = _now_dt() + await self.table_store.put_workspace( + id=v.workspace_record.id, + name=v.workspace_record.name or v.workspace_record.id, + enabled=v.workspace_record.enabled, + created=now, + ) + row = await self.table_store.get_workspace(v.workspace_record.id) + return IamResponse(workspace=self._row_to_workspace_record(row)) + + async def handle_list_workspaces(self, v): + rows = await self.table_store.list_workspaces() + return IamResponse( + workspaces=[ + self._row_to_workspace_record(r) for r in rows + ], + ) + + async def handle_get_workspace(self, v): + if v.workspace_record is None or not v.workspace_record.id: + return _err("invalid-argument", "workspace_record.id required") + row = await self.table_store.get_workspace(v.workspace_record.id) + if row is None: + return _err("not-found", "workspace not found") + return IamResponse(workspace=self._row_to_workspace_record(row)) + + async def handle_update_workspace(self, v): + """Update workspace name / enabled. 
The id is immutable.""" + if v.workspace_record is None or not v.workspace_record.id: + return _err("invalid-argument", "workspace_record.id required") + row = await self.table_store.get_workspace(v.workspace_record.id) + if row is None: + return _err("not-found", "workspace not found") + + _, cur_name, cur_enabled, _created = row + new_name = ( + v.workspace_record.name + if v.workspace_record.name else cur_name + ) + new_enabled = ( + v.workspace_record.enabled + if v.workspace_record.enabled is not None + else cur_enabled + ) + + await self.table_store.update_workspace( + id=v.workspace_record.id, + name=new_name, + enabled=new_enabled, + ) + updated = await self.table_store.get_workspace( + v.workspace_record.id, + ) + return IamResponse( + workspace=self._row_to_workspace_record(updated), + ) + + async def handle_disable_workspace(self, v): + """Set enabled=false, disable every user in the workspace, + revoke every API key belonging to those users.""" + if v.workspace_record is None or not v.workspace_record.id: + return _err("invalid-argument", "workspace_record.id required") + + row = await self.table_store.get_workspace(v.workspace_record.id) + if row is None: + return _err("not-found", "workspace not found") + + await self.table_store.update_workspace( + id=v.workspace_record.id, + name=row[1] or v.workspace_record.id, + enabled=False, + ) + + user_rows = await self.table_store.list_users_by_workspace( + v.workspace_record.id, + ) + for ur in user_rows: + user_id = ur[0] + await self.table_store.update_user_enabled( + id=user_id, enabled=False, + ) + key_rows = await self.table_store.list_api_keys_by_user(user_id) + for kr in key_rows: + await self.table_store.delete_api_key(kr[0]) + + return IamResponse() + + # ------------------------------------------------------------------ + # rotate-signing-key + # ------------------------------------------------------------------ + + async def handle_rotate_signing_key(self, v): + """Create a new Ed25519 signing 
key, retire the current + active key, switch the in-memory cache over. + + The retired key row is kept in ``iam_signing_keys`` so the + gateway's JWT validator can continue to validate previously- + issued tokens during the grace period. Actual grace-period + enforcement (time-window acceptance at the validator) lands + with the gateway auth middleware work.""" + + # Retire the currently-active key, if any. + current = await self._get_active_signing_key() + now = _now_dt() + if current is not None: + cur_kid, _cur_priv, _cur_pub = current + await self.table_store.retire_signing_key( + kid=cur_kid, retired=now, + ) + + new_kid, new_priv, new_pub = _generate_signing_keypair() + await self.table_store.put_signing_key( + kid=new_kid, + private_pem=new_priv, + public_pem=new_pub, + created=now, + retired=None, + ) + self._signing_key = (new_kid, new_priv, new_pub) + logger.info( + f"IAM: rotated signing key. " + f"New kid={new_kid}, retired kid={(current or (None,))[0]}" + ) + return IamResponse() + + # ------------------------------------------------------------------ + # resolve-api-key + # ------------------------------------------------------------------ + + async def handle_resolve_api_key(self, v): + if not v.api_key: + return _err("auth-failed", "no api key") + + row = await self.table_store.get_api_key_by_hash( + _hash_api_key(v.api_key), + ) + if row is None: + return _err("auth-failed", "unknown api key") + + ( + _key_hash, _id, user_id, _name, _prefix, expires, + _created, _last_used, + ) = row + + if expires is not None: + exp_dt = expires + if isinstance(exp_dt, str): + exp_dt = datetime.datetime.fromisoformat(exp_dt) + if exp_dt.tzinfo is None: + exp_dt = exp_dt.replace(tzinfo=datetime.timezone.utc) + if exp_dt < _now_dt(): + return _err("auth-failed", "api key expired") + + user_row = await self.table_store.get_user(user_id) + if user_row is None: + return _err("auth-failed", "owning user missing") + user = self._row_to_user_record(user_row) + if not 
user.enabled: + return _err("auth-failed", "owning user disabled") + + # Workspace-disabled check. + ws_row = await self.table_store.get_workspace(user.workspace) + if ws_row is None or not ws_row[2]: + return _err("auth-failed", "owning workspace disabled") + + return IamResponse( + resolved_user_id=user.id, + resolved_workspace=user.workspace, + resolved_roles=list(user.roles), + ) + + # ------------------------------------------------------------------ + # create-user + # ------------------------------------------------------------------ + + async def handle_create_user(self, v): + if not v.workspace: + return _err( + "invalid-argument", "workspace required for create-user", + ) + if v.user is None: + return _err( + "invalid-argument", "user field required for create-user", + ) + if not v.user.username: + return _err("invalid-argument", "user.username required") + if not v.user.password: + return _err("invalid-argument", "user.password required") + + # Workspace must exist and be enabled. + ws = await self.table_store.get_workspace(v.workspace) + if ws is None or not ws[2]: + return _err("not-found", "workspace not found or disabled") + + # Uniqueness on username within workspace. 
+ existing = await self.table_store.get_user_id_by_username( + v.workspace, v.user.username, + ) + if existing: + return _err("duplicate", "username already exists") + + user_id = str(uuid.uuid4()) + now = _now_dt() + + await self.table_store.put_user( + id=user_id, + workspace=v.workspace, + username=v.user.username, + name=v.user.name or v.user.username, + email=v.user.email or "", + password_hash=_hash_password(v.user.password), + roles=list(v.user.roles or []), + enabled=v.user.enabled, + must_change_password=v.user.must_change_password, + created=now, + ) + + row = await self.table_store.get_user(user_id) + return IamResponse(user=self._row_to_user_record(row)) + + # ------------------------------------------------------------------ + # list-users + # ------------------------------------------------------------------ + + async def handle_list_users(self, v): + # System-level operation: workspace, when supplied, is a + # filter on the user record's home-workspace association. + # Empty workspace returns the deployment-wide list — the + # gateway has already authorised the caller's authority to + # see that scope. + if v.workspace: + rows = await self.table_store.list_users_by_workspace(v.workspace) + else: + rows = await self.table_store.list_users() + return IamResponse( + users=[self._row_to_user_record(r) for r in rows], + ) + + # ------------------------------------------------------------------ + # create-api-key + # ------------------------------------------------------------------ + + async def handle_create_api_key(self, v): + if v.key is None or not v.key.user_id: + return _err("invalid-argument", "key.user_id required") + if not v.key.name: + return _err("invalid-argument", "key.name required") + + # API keys are system-level records with a workspace + # association (the user's home workspace). 
Workspace is an + # optional integrity check on the caller's request — when + # supplied it must match the target user's home workspace; + # when omitted, the user's home workspace is used. + user_row, err = await self._resolve_user( + v.key.user_id, v.workspace or None, + ) + if err is not None: + return err + + plaintext = _generate_api_key() + key_id = str(uuid.uuid4()) + now = _now_dt() + expires_dt = _parse_expires(v.key.expires) + + await self.table_store.put_api_key( + key_hash=_hash_api_key(plaintext), + id=key_id, + user_id=v.key.user_id, + name=v.key.name, + prefix=plaintext[:len(API_KEY_PREFIX) + 4], + expires=expires_dt, + created=now, + last_used=None, + ) + + row = await self.table_store.get_api_key_by_hash( + _hash_api_key(plaintext), + ) + return IamResponse( + api_key_plaintext=plaintext, + api_key=self._row_to_api_key_record(row), + ) + + # ------------------------------------------------------------------ + # list-api-keys + # ------------------------------------------------------------------ + + async def handle_list_api_keys(self, v): + if not v.user_id: + return _err( + "invalid-argument", "user_id required for list-api-keys", + ) + + # Workspace is an optional integrity check. 
+ _, err = await self._resolve_user(v.user_id, v.workspace or None) + if err is not None: + return err + + rows = await self.table_store.list_api_keys_by_user(v.user_id) + return IamResponse( + api_keys=[self._row_to_api_key_record(r) for r in rows], + ) + + # ------------------------------------------------------------------ + # revoke-api-key + # ------------------------------------------------------------------ + + async def handle_revoke_api_key(self, v): + if not v.key_id: + return _err("invalid-argument", "key_id required") + + row = await self.table_store.get_api_key_by_id(v.key_id) + if row is None: + return _err("not-found", "api key not found") + + key_hash, _id, user_id, _name, _prefix, _expires, _c, _lu = row + + # Workspace is an optional integrity check via the owning user. + if v.workspace: + user_row = await self.table_store.get_user(user_id) + if user_row is None or user_row[1] != v.workspace: + return _err( + "operation-not-permitted", + "key belongs to a different workspace", + ) + + await self.table_store.delete_api_key(key_hash) + return IamResponse() + + # ------------------------------------------------------------------ + # authorise / authorise-many + # + # The IAM contract (see docs/tech-specs/iam-contract.md) calls + # for the regime — not the gateway — to decide whether an + # identity may perform a capability on a resource given the + # operation's parameters. These two operations are the OSS + # regime's implementation of that contract. + # + # Inputs (on IamRequest): + # user_id — the identity handle (the gateway's + # opaque reference). For OSS this is the + # user record's id. + # capability — the capability string from the + # capabilities.md vocabulary. + # resource_json — JSON dict, the resource address + # ({} for system, {workspace} for + # workspace, {workspace, flow} for flow). + # parameters_json — JSON dict, decision-relevant operation + # parameters (e.g. workspace association + # on user-registry operations). 
    #     authorise_checks — for authorise-many, a JSON list of
    #                        {capability, resource, parameters}.
    #
    # Outputs (on IamResponse):
    #     decision_allow       — single allow / deny verdict.
    #     decision_ttl_seconds — gateway cache TTL for this
    #                            decision.
    #     decisions_json       — for authorise-many, list of
    #                            {allow, ttl} in request order.
    # ------------------------------------------------------------------

    def _decide(self, user_row, capability, resource, parameters):
        """Single authorisation decision. Returns (allow, ttl).

        Pure role/capability check against the in-memory
        ROLE_DEFINITIONS table; every outcome carries the same cache
        TTL (AUTHZ_CACHE_TTL_SECONDS).
        """

        # Unknown identity: deny (still cacheable by the gateway).
        if user_row is None:
            return False, AUTHZ_CACHE_TTL_SECONDS

        # user_row layout:
        # 0:id 1:workspace 2:username 3:name 4:email 5:password_hash
        # 6:roles 7:enabled 8:must_change_password 9:created
        if not user_row[7]:  # disabled user: always deny
            return False, AUTHZ_CACHE_TTL_SECONDS

        # NOTE(review): an earlier comment here described a
        # "disabled workspace check" (refusing credentials bound to
        # a disabled workspace, done only when a target workspace is
        # involved), but no workspace row is actually read below —
        # only the target workspace *name* is derived from the
        # request. TODO: confirm whether the workspace enabled-flag
        # read was intended and implement it, or drop this note.
        target_workspace = (
            (resource or {}).get("workspace")
            or (parameters or {}).get("workspace")
        )

        roles = user_row[6] or set()
        assigned_workspace = user_row[1]

        # Allow if any assigned role both grants the capability and
        # has a workspace scope covering the target workspace.
        for role_name in roles:
            defn = ROLE_DEFINITIONS.get(role_name)
            if defn is None:
                # Unknown role name in the user record — ignore it.
                continue
            if capability not in defn["capabilities"]:
                continue
            if target_workspace is None or _scope_permits(
                defn["workspace_scope"],
                target_workspace,
                assigned_workspace,
            ):
                return True, AUTHZ_CACHE_TTL_SECONDS

        return False, AUTHZ_CACHE_TTL_SECONDS

    async def handle_authorise(self, v):
        """Single authorisation check: decide (allow, ttl) for one
        (user, capability, resource, parameters) tuple."""
        if not v.capability:
            return _err("invalid-argument", "capability required")
        if not v.user_id:
            return _err("invalid-argument", "user_id (handle) required")

        try:
            resource = json.loads(v.resource_json or "{}")
            parameters = json.loads(v.parameters_json or "{}")
        except json.JSONDecodeError as e:
            return _err("invalid-argument", f"bad json: {e}")

        user_row = await self.table_store.get_user(v.user_id)
        allow, ttl = self._decide(
            user_row, v.capability, resource, parameters,
        )
        return IamResponse(
            decision_allow=allow,
            decision_ttl_seconds=ttl,
        )

    async def handle_authorise_many(self, v):
        """Batch authorisation: one user lookup, then one decision
        per entry of authorise_checks, returned in request order."""
        if not v.user_id:
            return _err("invalid-argument", "user_id (handle) required")
        if not v.authorise_checks:
            return _err("invalid-argument", "authorise_checks required")

        try:
            checks = json.loads(v.authorise_checks)
        except json.JSONDecodeError as e:
            return _err("invalid-argument", f"bad json: {e}")
        if not isinstance(checks, list):
            return _err(
                "invalid-argument",
                "authorise_checks must be a JSON list",
            )

        # One user lookup for the whole batch.
        user_row = await self.table_store.get_user(v.user_id)

        decisions = []
        for c in checks:
            if not isinstance(c, dict):
                # Malformed entry: deny, but keep positional
                # alignment with the request list.
                decisions.append({
                    "allow": False,
                    "ttl": AUTHZ_CACHE_TTL_SECONDS,
                })
                continue
            allow, ttl = self._decide(
                user_row,
                c.get("capability", ""),
                c.get("resource") or {},
                c.get("parameters") or {},
            )
            decisions.append({"allow": allow, "ttl": ttl})

        return IamResponse(decisions_json=json.dumps(decisions))
diff --git a/trustgraph-flow/trustgraph/iam/service/service.py b/trustgraph-flow/trustgraph/iam/service/service.py
new file mode 100644
index 00000000..147bd56a
--- /dev/null
+++ b/trustgraph-flow/trustgraph/iam/service/service.py
@@ -0,0 +1,233 @@
"""
IAM service processor. Terminates the IAM request queue and forwards
each request to the IamService business logic, then returns the
response on the IAM response queue.

Shape mirrors trustgraph.config.service.
"""

import logging
import os

from trustgraph.schema import Error
from trustgraph.schema import IamRequest, IamResponse
from trustgraph.schema import iam_request_queue, iam_response_queue

from trustgraph.base import AsyncProcessor, Consumer, Producer
from trustgraph.base import ConsumerMetrics, ProducerMetrics
from trustgraph.base.cassandra_config import (
    add_cassandra_args, resolve_cassandra_config,
)

from . iam import IamService

logger = logging.getLogger(__name__)

default_ident = "iam-svc"

default_iam_request_queue = iam_request_queue
default_iam_response_queue = iam_response_queue

# Environment variables consulted as a fallback when the
# corresponding params field is not set in the processor-group YAML
# or via CLI. Intended for K8s Secret / env-var injection so the
# bootstrap token never has to live in the YAML (and thus in git).
ENV_BOOTSTRAP_MODE = "IAM_BOOTSTRAP_MODE"
ENV_BOOTSTRAP_TOKEN = "IAM_BOOTSTRAP_TOKEN"


class Processor(AsyncProcessor):
    """IAM request-queue processor.

    Consumes IamRequest messages, delegates each to IamService, and
    produces IamResponse messages, carrying over the "id"
    correlation property.
    """

    def __init__(self, **params):
        """Validate bootstrap configuration, resolve Cassandra
        settings, and wire up the request consumer and response
        producer.

        Raises RuntimeError (fail-closed) when the bootstrap
        mode/token combination is missing or inconsistent.
        """

        iam_req_q = params.get(
            "iam_request_queue", default_iam_request_queue,
        )
        iam_resp_q = params.get(
            "iam_response_queue", default_iam_response_queue,
        )

        # Resolve bootstrap mode + token. Precedence: explicit
        # params (CLI / processor-group YAML) → environment variable
        # → unset (fail-closed). The env-var path is the K8s-native
        # injection point: an `IAM_BOOTSTRAP_TOKEN` from a Secret
        # never has to land in the YAML, and therefore never enters
        # git history.
        bootstrap_mode = (
            params.get("bootstrap_mode")
            or os.environ.get(ENV_BOOTSTRAP_MODE)
        )
        bootstrap_token = (
            params.get("bootstrap_token")
            or os.environ.get(ENV_BOOTSTRAP_TOKEN)
        )

        # Fail-closed validation: an unset or unknown mode refuses
        # startup rather than defaulting to anything permissive.
        if bootstrap_mode not in ("token", "bootstrap"):
            raise RuntimeError(
                "iam-svc: bootstrap-mode is required. Set to 'token' "
                "(with bootstrap-token) for production, or 'bootstrap' "
                "to enable the explicit bootstrap operation over the "
                "pub/sub bus (dev / quick-start only, not safe under "
                "public exposure). Configurable via processor-group "
                f"params or the {ENV_BOOTSTRAP_MODE} environment "
                "variable. Refusing to start."
            )
        if bootstrap_mode == "token" and not bootstrap_token:
            raise RuntimeError(
                "iam-svc: bootstrap-mode=token requires bootstrap-token "
                f"(or the {ENV_BOOTSTRAP_TOKEN} environment "
                "variable). Refusing to start."
            )
        if bootstrap_mode == "bootstrap" and bootstrap_token:
            raise RuntimeError(
                "iam-svc: bootstrap-token is not accepted when "
                "bootstrap-mode=bootstrap. Ambiguous intent. "
                "Refusing to start."
            )

        self.bootstrap_mode = bootstrap_mode
        self.bootstrap_token = bootstrap_token

        cassandra_host = params.get("cassandra_host")
        cassandra_username = params.get("cassandra_username")
        cassandra_password = params.get("cassandra_password")

        # Normalise host/credentials/keyspace via the shared helper
        # so every Cassandra-backed processor resolves them the same
        # way.
        hosts, username, password, keyspace = resolve_cassandra_config(
            host=cassandra_host,
            username=cassandra_username,
            password=cassandra_password,
            default_keyspace="iam",
        )

        self.cassandra_host = hosts
        self.cassandra_username = username
        self.cassandra_password = password

        super().__init__(
            **params | {
                "iam_request_schema": IamRequest.__name__,
                "iam_response_schema": IamResponse.__name__,
                "cassandra_host": self.cassandra_host,
                "cassandra_username": self.cassandra_username,
                "cassandra_password": self.cassandra_password,
            }
        )

        iam_request_metrics = ConsumerMetrics(
            processor=self.id, flow=None, name="iam-request",
        )
        iam_response_metrics = ProducerMetrics(
            processor=self.id, flow=None, name="iam-response",
        )

        self.iam_request_topic = iam_req_q

        self.iam_request_consumer = Consumer(
            taskgroup=self.taskgroup,
            backend=self.pubsub,
            flow=None,
            topic=iam_req_q,
            subscriber=self.id,
            schema=IamRequest,
            handler=self.on_iam_request,
            metrics=iam_request_metrics,
        )

        self.iam_response_producer = Producer(
            backend=self.pubsub,
            topic=iam_resp_q,
            schema=IamResponse,
            metrics=iam_response_metrics,
        )

        # Business logic lives in IamService; this processor only
        # does queue transport.
        self.iam = IamService(
            host=self.cassandra_host,
            username=self.cassandra_username,
            password=self.cassandra_password,
            keyspace=keyspace,
            bootstrap_mode=self.bootstrap_mode,
            bootstrap_token=self.bootstrap_token,
        )

        logger.info(
            f"IAM service initialised (bootstrap-mode={self.bootstrap_mode})"
        )

    async def start(self):
        await self.pubsub.ensure_topic(self.iam_request_topic)
        # Token-mode auto-bootstrap runs before we accept requests so
        # the first inbound call always sees a populated table.
+ await self.iam.auto_bootstrap_if_token_mode() + await self.iam_request_consumer.start() + + async def on_iam_request(self, msg, consumer, flow): + + id = None + try: + v = msg.value() + id = msg.properties()["id"] + logger.debug( + f"Handling IAM request {id} op={v.operation!r}" + ) + resp = await self.iam.handle(v) + await self.iam_response_producer.send( + resp, properties={"id": id}, + ) + except Exception as e: + logger.error( + f"IAM request failed: {type(e).__name__}: {e}", + exc_info=True, + ) + resp = IamResponse( + error=Error(type="internal-error", message=str(e)), + ) + if id is not None: + await self.iam_response_producer.send( + resp, properties={"id": id}, + ) + + @staticmethod + def add_args(parser): + AsyncProcessor.add_args(parser) + + parser.add_argument( + "--iam-request-queue", + default=default_iam_request_queue, + help=f"IAM request queue (default: {default_iam_request_queue})", + ) + parser.add_argument( + "--iam-response-queue", + default=default_iam_response_queue, + help=f"IAM response queue (default: {default_iam_response_queue})", + ) + parser.add_argument( + "--bootstrap-mode", + default=None, + choices=["token", "bootstrap"], + help=( + "IAM bootstrap mode (required). " + "'token' = operator supplies the initial admin API " + "key via --bootstrap-token; auto-seeds on first start, " + "bootstrap operation refused. " + "'bootstrap' = bootstrap operation is live over the " + "bus until tables are populated; a token is generated " + "and returned by tg-bootstrap-iam. Unsafe to run " + "'bootstrap' mode with public exposure." + ), + ) + parser.add_argument( + "--bootstrap-token", + default=None, + help=( + "Initial admin API key plaintext, required when " + "--bootstrap-mode=token. Treat as a one-time " + "credential: the operator should rotate to a new key " + "and revoke this one after first use." 
+ ), + ) + + add_cassandra_args(parser) + + +def run(): + Processor.launch(default_ident, __doc__) diff --git a/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py b/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py index f6c5dcb8..2e537fde 100755 --- a/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py +++ b/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py @@ -4,7 +4,7 @@ Simple LLM service, performs text prompt completion using an Ollama service. Input is prompt, output is response. """ -from ollama import Client +from ollama import AsyncClient import os import logging @@ -38,23 +38,23 @@ class Processor(LlmService): self.default_model = model self.temperature = temperature - self.llm = Client(host=ollama) + self.llm = AsyncClient(host=ollama) self._checked_models = set() - def _ensure_model(self, model_name): + async def _ensure_model(self, model_name): """Check if model exists locally, pull it if not.""" if model_name in self._checked_models: return try: - self.llm.show(model_name) + await self.llm.show(model_name) self._checked_models.add(model_name) except Exception as e: status_code = getattr(e, 'status_code', None) if status_code == 404 or "not found" in str(e).lower(): logger.info(f"Ollama model '{model_name}' not found locally. 
Pulling, this may take a while...") try: - self.llm.pull(model_name) + await self.llm.pull(model_name) self._checked_models.add(model_name) logger.info(f"Successfully pulled Ollama model '{model_name}'.") except Exception as pull_e: @@ -66,9 +66,9 @@ class Processor(LlmService): # Use provided model or fall back to default model_name = model or self.default_model - + # Ensure the model exists/is pulled - self._ensure_model(model_name) + await self._ensure_model(model_name) # Use provided temperature or fall back to default effective_temperature = temperature if temperature is not None else self.temperature @@ -79,7 +79,7 @@ class Processor(LlmService): try: - response = self.llm.generate(model_name, prompt, options={'temperature': effective_temperature}) + response = await self.llm.generate(model_name, prompt, options={'temperature': effective_temperature}) response_text = response['response'] logger.debug("Sending response...") @@ -113,7 +113,7 @@ class Processor(LlmService): model_name = model or self.default_model # Ensure the model exists/is pulled - self._ensure_model(model_name) + await self._ensure_model(model_name) effective_temperature = temperature if temperature is not None else self.temperature @@ -123,7 +123,7 @@ class Processor(LlmService): prompt = system + "\n\n" + prompt try: - stream = self.llm.generate( + stream = await self.llm.generate( model_name, prompt, options={'temperature': effective_temperature}, @@ -133,7 +133,7 @@ class Processor(LlmService): total_input_tokens = 0 total_output_tokens = 0 - for chunk in stream: + async for chunk in stream: if 'response' in chunk and chunk['response']: yield LlmChunk( text=chunk['response'], diff --git a/trustgraph-flow/trustgraph/tables/iam.py b/trustgraph-flow/trustgraph/tables/iam.py new file mode 100644 index 00000000..f1a0734f --- /dev/null +++ b/trustgraph-flow/trustgraph/tables/iam.py @@ -0,0 +1,436 @@ +""" +IAM Cassandra table store. 
+ +Tables: + - iam_workspaces (id primary key) + - iam_users (id primary key) + iam_users_by_username lookup table + (workspace, username) -> id + - iam_api_keys (key_hash primary key) with secondary index on user_id + - iam_signing_keys (kid primary key) — RSA keypairs for JWT signing + +See docs/tech-specs/iam-protocol.md for the wire-level context. +""" + +import logging + +from cassandra.cluster import Cluster +from cassandra.auth import PlainTextAuthProvider +from ssl import SSLContext, PROTOCOL_TLSv1_2 + +from . cassandra_async import async_execute + +logger = logging.getLogger(__name__) + + +class IamTableStore: + + def __init__( + self, + cassandra_host, cassandra_username, cassandra_password, + keyspace, + ): + self.keyspace = keyspace + + logger.info("IAM: connecting to Cassandra...") + + if isinstance(cassandra_host, str): + cassandra_host = [h.strip() for h in cassandra_host.split(",")] + + if cassandra_username and cassandra_password: + ssl_context = SSLContext(PROTOCOL_TLSv1_2) + auth_provider = PlainTextAuthProvider( + username=cassandra_username, password=cassandra_password, + ) + self.cluster = Cluster( + cassandra_host, + auth_provider=auth_provider, + ssl_context=ssl_context, + ) + else: + self.cluster = Cluster(cassandra_host) + + self.cassandra = self.cluster.connect() + + logger.info("IAM: connected.") + + self._ensure_schema() + self._prepare_statements() + + def _ensure_schema(self): + # FIXME: Replication factor should be configurable. 
+ self.cassandra.execute(f""" + create keyspace if not exists {self.keyspace} + with replication = {{ + 'class' : 'SimpleStrategy', + 'replication_factor' : 1 + }}; + """) + self.cassandra.set_keyspace(self.keyspace) + + self.cassandra.execute(""" + CREATE TABLE IF NOT EXISTS iam_workspaces ( + id text PRIMARY KEY, + name text, + enabled boolean, + created timestamp + ); + """) + + self.cassandra.execute(""" + CREATE TABLE IF NOT EXISTS iam_users ( + id text PRIMARY KEY, + workspace text, + username text, + name text, + email text, + password_hash text, + roles set, + enabled boolean, + must_change_password boolean, + created timestamp + ); + """) + + self.cassandra.execute(""" + CREATE TABLE IF NOT EXISTS iam_users_by_username ( + workspace text, + username text, + user_id text, + PRIMARY KEY ((workspace), username) + ); + """) + + self.cassandra.execute(""" + CREATE TABLE IF NOT EXISTS iam_api_keys ( + key_hash text PRIMARY KEY, + id text, + user_id text, + name text, + prefix text, + expires timestamp, + created timestamp, + last_used timestamp + ); + """) + + self.cassandra.execute(""" + CREATE INDEX IF NOT EXISTS iam_api_keys_user_id_idx + ON iam_api_keys (user_id); + """) + + self.cassandra.execute(""" + CREATE INDEX IF NOT EXISTS iam_api_keys_id_idx + ON iam_api_keys (id); + """) + + self.cassandra.execute(""" + CREATE TABLE IF NOT EXISTS iam_signing_keys ( + kid text PRIMARY KEY, + private_pem text, + public_pem text, + created timestamp, + retired timestamp + ); + """) + + logger.info("IAM: Cassandra schema OK.") + + def _prepare_statements(self): + c = self.cassandra + + self.put_workspace_stmt = c.prepare(""" + INSERT INTO iam_workspaces (id, name, enabled, created) + VALUES (?, ?, ?, ?) + """) + self.get_workspace_stmt = c.prepare(""" + SELECT id, name, enabled, created FROM iam_workspaces + WHERE id = ? 
+ """) + self.list_workspaces_stmt = c.prepare(""" + SELECT id, name, enabled, created FROM iam_workspaces + """) + + self.put_user_stmt = c.prepare(""" + INSERT INTO iam_users ( + id, workspace, username, name, email, password_hash, + roles, enabled, must_change_password, created + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """) + self.get_user_stmt = c.prepare(""" + SELECT id, workspace, username, name, email, password_hash, + roles, enabled, must_change_password, created + FROM iam_users WHERE id = ? + """) + self.list_users_by_workspace_stmt = c.prepare(""" + SELECT id, workspace, username, name, email, password_hash, + roles, enabled, must_change_password, created + FROM iam_users WHERE workspace = ? ALLOW FILTERING + """) + self.list_users_stmt = c.prepare(""" + SELECT id, workspace, username, name, email, password_hash, + roles, enabled, must_change_password, created + FROM iam_users + """) + + self.put_username_lookup_stmt = c.prepare(""" + INSERT INTO iam_users_by_username (workspace, username, user_id) + VALUES (?, ?, ?) + """) + self.get_user_id_by_username_stmt = c.prepare(""" + SELECT user_id FROM iam_users_by_username + WHERE workspace = ? AND username = ? + """) + self.delete_username_lookup_stmt = c.prepare(""" + DELETE FROM iam_users_by_username + WHERE workspace = ? AND username = ? + """) + self.delete_user_stmt = c.prepare(""" + DELETE FROM iam_users WHERE id = ? + """) + + self.put_api_key_stmt = c.prepare(""" + INSERT INTO iam_api_keys ( + key_hash, id, user_id, name, prefix, expires, + created, last_used + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """) + self.get_api_key_by_hash_stmt = c.prepare(""" + SELECT key_hash, id, user_id, name, prefix, expires, + created, last_used + FROM iam_api_keys WHERE key_hash = ? + """) + self.get_api_key_by_id_stmt = c.prepare(""" + SELECT key_hash, id, user_id, name, prefix, expires, + created, last_used + FROM iam_api_keys WHERE id = ? 
+ """) + self.list_api_keys_by_user_stmt = c.prepare(""" + SELECT key_hash, id, user_id, name, prefix, expires, + created, last_used + FROM iam_api_keys WHERE user_id = ? + """) + self.delete_api_key_stmt = c.prepare(""" + DELETE FROM iam_api_keys WHERE key_hash = ? + """) + + self.put_signing_key_stmt = c.prepare(""" + INSERT INTO iam_signing_keys ( + kid, private_pem, public_pem, created, retired + ) + VALUES (?, ?, ?, ?, ?) + """) + self.list_signing_keys_stmt = c.prepare(""" + SELECT kid, private_pem, public_pem, created, retired + FROM iam_signing_keys + """) + self.retire_signing_key_stmt = c.prepare(""" + UPDATE iam_signing_keys SET retired = ? WHERE kid = ? + """) + + self.update_user_profile_stmt = c.prepare(""" + UPDATE iam_users + SET name = ?, email = ?, roles = ?, enabled = ?, + must_change_password = ? + WHERE id = ? + """) + self.update_user_password_stmt = c.prepare(""" + UPDATE iam_users + SET password_hash = ?, must_change_password = ? + WHERE id = ? + """) + self.update_user_enabled_stmt = c.prepare(""" + UPDATE iam_users SET enabled = ? WHERE id = ? + """) + + self.update_workspace_stmt = c.prepare(""" + UPDATE iam_workspaces SET name = ?, enabled = ? + WHERE id = ? 
+ """) + + # ------------------------------------------------------------------ + # Workspaces + # ------------------------------------------------------------------ + + async def put_workspace(self, id, name, enabled, created): + await async_execute( + self.cassandra, self.put_workspace_stmt, + (id, name, enabled, created), + ) + + async def get_workspace(self, id): + rows = await async_execute( + self.cassandra, self.get_workspace_stmt, (id,), + ) + return rows[0] if rows else None + + async def list_workspaces(self): + return await async_execute( + self.cassandra, self.list_workspaces_stmt, + ) + + # ------------------------------------------------------------------ + # Users + # ------------------------------------------------------------------ + + async def put_user( + self, id, workspace, username, name, email, password_hash, + roles, enabled, must_change_password, created, + ): + await async_execute( + self.cassandra, self.put_user_stmt, + ( + id, workspace, username, name, email, password_hash, + set(roles) if roles else set(), + enabled, must_change_password, created, + ), + ) + await async_execute( + self.cassandra, self.put_username_lookup_stmt, + (workspace, username, id), + ) + + async def get_user(self, id): + rows = await async_execute( + self.cassandra, self.get_user_stmt, (id,), + ) + return rows[0] if rows else None + + async def get_user_id_by_username(self, workspace, username): + rows = await async_execute( + self.cassandra, self.get_user_id_by_username_stmt, + (workspace, username), + ) + return rows[0][0] if rows else None + + async def list_users_by_workspace(self, workspace): + return await async_execute( + self.cassandra, self.list_users_by_workspace_stmt, (workspace,), + ) + + async def list_users(self): + """List every user across the deployment. 
 Used by the
        system-level list-users handler when no workspace filter is
        supplied; the gateway has already authorised the call against
        the caller's authority."""
        return await async_execute(
            self.cassandra, self.list_users_stmt, (),
        )

    async def delete_user(self, id):
        # Deletes only the main row; the caller must also remove the
        # username lookup via delete_username_lookup.
        await async_execute(
            self.cassandra, self.delete_user_stmt, (id,),
        )

    async def delete_username_lookup(self, workspace, username):
        # Remove the (workspace, username) -> user_id mapping.
        await async_execute(
            self.cassandra, self.delete_username_lookup_stmt,
            (workspace, username),
        )

    # ------------------------------------------------------------------
    # API keys
    # ------------------------------------------------------------------

    async def put_api_key(
        self, key_hash, id, user_id, name, prefix, expires,
        created, last_used,
    ):
        # Key row is keyed by the key's hash, not its record id.
        await async_execute(
            self.cassandra, self.put_api_key_stmt,
            (key_hash, id, user_id, name, prefix, expires,
            created, last_used),
        )

    async def get_api_key_by_hash(self, key_hash):
        # Returns the key row tuple, or None when absent.
        rows = await async_execute(
            self.cassandra, self.get_api_key_by_hash_stmt, (key_hash,),
        )
        return rows[0] if rows else None

    async def get_api_key_by_id(self, id):
        # Secondary-index lookup by record id; row tuple or None.
        rows = await async_execute(
            self.cassandra, self.get_api_key_by_id_stmt, (id,),
        )
        return rows[0] if rows else None

    async def list_api_keys_by_user(self, user_id):
        # Secondary-index lookup: all key rows owned by user_id.
        return await async_execute(
            self.cassandra, self.list_api_keys_by_user_stmt, (user_id,),
        )

    async def delete_api_key(self, key_hash):
        await async_execute(
            self.cassandra, self.delete_api_key_stmt, (key_hash,),
        )

    # ------------------------------------------------------------------
    # Signing keys
    # ------------------------------------------------------------------

    async def put_signing_key(self, kid, private_pem, public_pem,
            created, retired):
        await async_execute(
            self.cassandra, self.put_signing_key_stmt,
            (kid, private_pem, public_pem, created, retired),
        )

    async def list_signing_keys(self):
        # Returns all signing-key rows, retired ones included.
        return await async_execute(
            self.cassandra, self.list_signing_keys_stmt,
        )

    async def retire_signing_key(self, kid, retired):
        # Marks the key retired by stamping a retirement time.
        await async_execute(
            self.cassandra, self.retire_signing_key_stmt,
            (retired, kid),
        )

    # ------------------------------------------------------------------
    # User partial updates
    # ------------------------------------------------------------------

    async def update_user_profile(
        self, id, name, email, roles, enabled, must_change_password,
    ):
        # Updates everything except the password hash.
        await async_execute(
            self.cassandra, self.update_user_profile_stmt,
            (
                name, email,
                set(roles) if roles else set(),
                enabled, must_change_password, id,
            ),
        )

    async def update_user_password(
        self, id, password_hash, must_change_password,
    ):
        await async_execute(
            self.cassandra, self.update_user_password_stmt,
            (password_hash, must_change_password, id),
        )

    async def update_user_enabled(self, id, enabled):
        await async_execute(
            self.cassandra, self.update_user_enabled_stmt,
            (enabled, id),
        )

    # ------------------------------------------------------------------
    # Workspace updates
    # ------------------------------------------------------------------

    async def update_workspace(self, id, name, enabled):
        await async_execute(
            self.cassandra, self.update_workspace_stmt,
            (name, enabled, id),
        )

    # ------------------------------------------------------------------
    # Bootstrap helpers
    # ------------------------------------------------------------------

    async def any_workspace_exists(self):
        # True when at least one workspace row exists — gates the
        # bootstrap operation.
        rows = await self.list_workspaces()
        return bool(rows)