Merge branch 'release/v2.4'

This commit is contained in:
Cyber MacGeddon 2026-04-29 17:56:48 +01:00
commit f3434307c5
91 changed files with 10657 additions and 1218 deletions

View file

@ -60,8 +60,10 @@ agent-orchestrator = "trustgraph.agent.orchestrator:run"
api-gateway = "trustgraph.gateway:run"
chunker-recursive = "trustgraph.chunking.recursive:run"
chunker-token = "trustgraph.chunking.token:run"
bootstrap = "trustgraph.bootstrap.bootstrapper:run"
config-svc = "trustgraph.config.service:run"
flow-svc = "trustgraph.flow.service:run"
iam-svc = "trustgraph.iam.service:run"
doc-embeddings-query-milvus = "trustgraph.query.doc_embeddings.milvus:run"
doc-embeddings-query-pinecone = "trustgraph.query.doc_embeddings.pinecone:run"
doc-embeddings-query-qdrant = "trustgraph.query.doc_embeddings.qdrant:run"

View file

@ -0,0 +1,68 @@
"""
Bootstrap framework: Initialiser base class and per-wake context.
See docs/tech-specs/bootstrap.md for the full design.
"""
import logging
from dataclasses import dataclass
from typing import Any
@dataclass
class InitContext:
    """Shared per-wake context passed to each initialiser.

    The bootstrapper constructs one of these on every wake cycle,
    tears it down at cycle end, and passes it into each initialiser's
    ``run()`` method.  Fields are short-lived and safe to use during
    a single cycle only.
    """
    # Per-initialiser child logger supplied by the bootstrapper.
    logger: logging.Logger
    config: Any  # ConfigClient
    flow: Any    # RequestResponse client for flow-svc
class Initialiser:
    """Base class for bootstrap initialisers.

    Subclasses implement :meth:`run`.  The bootstrapper manages
    completion state, flag comparison, retry and error handling —
    subclasses describe only the work to perform.

    Class attributes:

    * ``wait_for_services`` (bool, default ``True``): when ``True`` the
      initialiser only runs after the bootstrapper's service gate has
      passed (config-svc and flow-svc reachable).  Set ``False`` for
      initialisers that bring up infrastructure the gate itself
      depends on — principally Pulsar topology, without which
      config-svc cannot come online.
    """

    wait_for_services: bool = True

    def __init__(self, **params):
        # Subclasses should consume their own params via keyword
        # arguments in their own __init__ signatures. This catch-all
        # is here so any kwargs that filter through unnoticed don't
        # raise TypeError on construction.
        pass

    async def run(self, ctx, old_flag, new_flag):
        """Perform initialisation work.

        :param ctx: :class:`InitContext` with logger, config client,
            flow-svc client.
        :param old_flag: Previously-stored flag string, or ``None`` if
            this initialiser has never successfully completed in this
            deployment.
        :param new_flag: Currently-configured flag.  A string chosen
            by the operator; typically something like ``"v1"``.
        :raises: Any exception on failure.  The bootstrapper catches,
            logs, and re-runs on the next cycle; completion state is
            only written on clean return.
        """
        raise NotImplementedError

View file

@ -0,0 +1 @@
from . service import *

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
"""Module entry point: delegates to the package's ``run()`` launcher."""
from . service import run

if __name__ == '__main__':
    run()

View file

@ -0,0 +1,414 @@
"""
Bootstrapper processor.
Runs a pluggable list of initialisers in a reconciliation loop.
Each initialiser's completion state is recorded in the reserved
``__system__`` workspace under the ``init-state`` config type.
See docs/tech-specs/bootstrap.md for the full design.
"""
import asyncio
import importlib
import json
import logging
import uuid
from argparse import ArgumentParser
from dataclasses import dataclass
from trustgraph.base import AsyncProcessor
from trustgraph.base import ProducerMetrics, SubscriberMetrics
from trustgraph.base.config_client import ConfigClient
from trustgraph.base.request_response_spec import RequestResponse
from trustgraph.schema import (
ConfigRequest, ConfigResponse,
config_request_queue, config_response_queue,
)
from trustgraph.schema import (
FlowRequest, FlowResponse,
flow_request_queue, flow_response_queue,
)
from .. base import Initialiser, InitContext
logger = logging.getLogger(__name__)

default_ident = "bootstrap"

# Reserved workspace + config type under which completion state is
# stored.  Reserved (`_`-prefix) workspaces are excluded from the
# config push broadcast — live processors never see these keys.
SYSTEM_WORKSPACE = "__system__"
INIT_STATE_TYPE = "init-state"

# Cadence tiers (seconds between wake cycles).
GATE_BACKOFF = 5        # Services not responding; retry soon.
INIT_RETRY = 15         # Gate passed but something ran/failed;
                        # converge quickly.
STEADY_INTERVAL = 300   # Everything at target flag; idle cheaply.
@dataclass
class InitialiserSpec:
    """One entry in the bootstrapper's configured list of initialisers."""
    # Unique name; also the key under which completion state is stored.
    name: str
    # Target flag; the spec re-runs whenever the stored flag differs.
    flag: str
    # Instantiated Initialiser, constructed from the configured params.
    instance: Initialiser
def _resolve_class(dotted):
    """Import and return a class given its fully-qualified dotted path.

    ``"pkg.mod.Cls"`` imports ``pkg.mod`` and returns its ``Cls``
    attribute.  A bare name with no module component is rejected.
    """
    module_path, _, attr = dotted.rpartition(".")
    if not module_path:
        raise ValueError(
            f"Initialiser class must be a dotted path, got {dotted!r}"
        )
    return getattr(importlib.import_module(module_path), attr)
def _load_initialisers_file(path):
    """Load the initialisers spec list from a YAML or JSON file.

    File shape:

    .. code-block:: yaml

        initialisers:
          - class: trustgraph.bootstrap.initialisers.PulsarTopology
            name: pulsar-topology
            flag: v1
            params:
              admin_url: http://pulsar:8080
              tenant: tg
          - ...

    :param path: Path to the file.  ``.yaml`` / ``.yml`` extensions are
        parsed as YAML; anything else is parsed as JSON.
    :returns: The list stored under the top-level ``initialisers`` key.
    :raises RuntimeError: when the document is not a mapping with an
        ``initialisers`` key, or that key is not a list.
    """
    # Explicit encoding: config files should not depend on the host's
    # locale default.
    with open(path, encoding="utf-8") as f:
        content = f.read()
    if path.endswith((".yaml", ".yml")):
        import yaml
        doc = yaml.safe_load(content)
    else:
        doc = json.loads(content)
    if not isinstance(doc, dict) or "initialisers" not in doc:
        raise RuntimeError(
            f"{path}: expected a mapping with an 'initialisers' key"
        )
    inits = doc["initialisers"]
    # Guard against a scalar or mapping here; individual entries are
    # validated further by the caller (Processor.__init__).
    if not isinstance(inits, list):
        raise RuntimeError(
            f"{path}: 'initialisers' must be a list, "
            f"got {type(inits).__name__}"
        )
    return inits
class Processor(AsyncProcessor):
    """Bootstrapper: runs the configured initialisers in a
    reconciliation loop, persisting per-initialiser completion flags
    under the reserved ``__system__`` workspace / ``init-state`` type.
    """

    def __init__(self, **params):
        super().__init__(**params)

        # Source the initialisers list either from a direct parameter
        # (processor-group embedding) or from a file (CLI launch).
        inits = params.get("initialisers")
        if inits is None:
            inits_file = params.get("initialisers_file")
            if inits_file is None:
                raise RuntimeError(
                    "Bootstrapper requires either the 'initialisers' "
                    "parameter or --initialisers-file"
                )
            inits = _load_initialisers_file(inits_file)

        # Validate and instantiate every spec up front so config
        # errors fail fast at startup rather than mid-loop.
        self.specs = []
        names = set()
        for entry in inits:
            if not isinstance(entry, dict):
                raise RuntimeError(
                    f"Initialiser entry must be a mapping, got: {entry!r}"
                )
            for required in ("class", "name", "flag"):
                if required not in entry:
                    raise RuntimeError(
                        f"Initialiser entry missing required field "
                        f"{required!r}: {entry!r}"
                    )
            name = entry["name"]
            if name in names:
                raise RuntimeError(f"Duplicate initialiser name {name!r}")
            names.add(name)
            cls = _resolve_class(entry["class"])
            try:
                instance = cls(**entry.get("params", {}))
            except Exception as e:
                # NOTE(review): consider `raise ... from e` to make the
                # cause chain explicit.
                raise RuntimeError(
                    f"Failed to instantiate initialiser "
                    f"{entry['class']!r} as {name!r}: "
                    f"{type(e).__name__}: {e}"
                )
            self.specs.append(InitialiserSpec(
                name=name,
                flag=entry["flag"],
                instance=instance,
            ))

        logger.info(
            f"Bootstrapper: loaded {len(self.specs)} initialisers"
        )

    # ------------------------------------------------------------------
    # Client construction (short-lived per wake cycle).
    # ------------------------------------------------------------------

    def _make_config_client(self):
        """Build a fresh config-svc request/response client.

        A new uuid subscription suffix per call avoids collisions with
        subscriptions left behind by previous wake cycles.
        """
        rr_id = str(uuid.uuid4())
        return ConfigClient(
            backend=self.pubsub_backend,
            subscription=f"{self.id}--config--{rr_id}",
            consumer_name=self.id,
            request_topic=config_request_queue,
            request_schema=ConfigRequest,
            request_metrics=ProducerMetrics(
                processor=self.id, flow=None, name="config-request",
            ),
            response_topic=config_response_queue,
            response_schema=ConfigResponse,
            response_metrics=SubscriberMetrics(
                processor=self.id, flow=None, name="config-response",
            ),
        )

    def _make_flow_client(self):
        """Build a fresh flow-svc request/response client."""
        rr_id = str(uuid.uuid4())
        return RequestResponse(
            backend=self.pubsub_backend,
            subscription=f"{self.id}--flow--{rr_id}",
            consumer_name=self.id,
            request_topic=flow_request_queue,
            request_schema=FlowRequest,
            request_metrics=ProducerMetrics(
                processor=self.id, flow=None, name="flow-request",
            ),
            response_topic=flow_response_queue,
            response_schema=FlowResponse,
            response_metrics=SubscriberMetrics(
                processor=self.id, flow=None, name="flow-response",
            ),
        )

    async def _open_clients(self):
        """Start both clients; on partial failure, stop the one that
        already started before re-raising so nothing leaks."""
        config = self._make_config_client()
        flow = self._make_flow_client()
        await config.start()
        try:
            await flow.start()
        except Exception:
            await self._safe_stop(config)
            raise
        return config, flow

    async def _safe_stop(self, client):
        """Best-effort stop: shutdown errors are deliberately swallowed."""
        try:
            await client.stop()
        except Exception:
            pass

    # ------------------------------------------------------------------
    # Service gate.
    # ------------------------------------------------------------------

    async def _gate_ready(self, config, flow):
        """Return True when both config-svc and flow-svc answer."""
        try:
            await config.keys(SYSTEM_WORKSPACE, INIT_STATE_TYPE)
        except Exception as e:
            logger.info(
                f"Gate: config-svc not ready ({type(e).__name__}: {e})"
            )
            return False
        try:
            resp = await flow.request(
                FlowRequest(
                    operation="list-blueprints",
                    workspace=SYSTEM_WORKSPACE,
                ),
                timeout=5,
            )
            if resp.error:
                logger.info(
                    f"Gate: flow-svc error: "
                    f"{resp.error.type}: {resp.error.message}"
                )
                return False
        except Exception as e:
            logger.info(
                f"Gate: flow-svc not ready ({type(e).__name__}: {e})"
            )
            return False
        return True

    # ------------------------------------------------------------------
    # Completion state.
    # ------------------------------------------------------------------

    async def _stored_flag(self, config, name):
        """Read the stored completion flag for *name*, or None if the
        initialiser has never completed in this deployment."""
        raw = await config.get(SYSTEM_WORKSPACE, INIT_STATE_TYPE, name)
        if raw is None:
            return None
        try:
            return json.loads(raw)
        except Exception:
            # Hand-written / legacy values may not be JSON; use verbatim.
            return raw

    async def _store_flag(self, config, name, flag):
        """Persist the completion flag for *name* (JSON-encoded)."""
        await config.put(
            SYSTEM_WORKSPACE, INIT_STATE_TYPE, name,
            json.dumps(flag),
        )

    # ------------------------------------------------------------------
    # Per-spec execution.
    # ------------------------------------------------------------------

    async def _run_spec(self, spec, config, flow):
        """Run a single initialiser spec.

        Returns one of:

        - ``"skip"``: stored flag already matches target, nothing to do.
        - ``"ran"``: initialiser ran and completion state was updated.
        - ``"failed"``: initialiser raised.
        - ``"failed-state-write"``: initialiser succeeded but we could
          not persist the new flag (transient — will re-run next cycle).
        """
        try:
            old_flag = await self._stored_flag(config, spec.name)
        except Exception as e:
            logger.warning(
                f"{spec.name}: could not read stored flag "
                f"({type(e).__name__}: {e})"
            )
            return "failed"
        if old_flag == spec.flag:
            return "skip"
        child_logger = logger.getChild(spec.name)
        child_ctx = InitContext(
            logger=child_logger,
            config=config,
            flow=flow,
        )
        child_logger.info(
            f"Running (old_flag={old_flag!r} -> new_flag={spec.flag!r})"
        )
        try:
            await spec.instance.run(child_ctx, old_flag, spec.flag)
        except Exception as e:
            child_logger.error(
                f"Failed: {type(e).__name__}: {e}", exc_info=True,
            )
            return "failed"
        try:
            await self._store_flag(config, spec.name, spec.flag)
        except Exception as e:
            child_logger.warning(
                f"Completed but could not persist state flag "
                f"({type(e).__name__}: {e}); will re-run next cycle"
            )
            return "failed-state-write"
        child_logger.info(f"Completed (flag={spec.flag!r})")
        return "ran"

    # ------------------------------------------------------------------
    # Main loop.
    # ------------------------------------------------------------------

    async def run(self):
        """Reconciliation loop.

        Each wake: open short-lived clients, run pre-gate initialisers,
        check the service gate, run post-gate initialisers if the gate
        passed, pick a sleep cadence, tear the clients down, sleep.
        """
        logger.info(
            f"Bootstrapper starting with {len(self.specs)} initialisers"
        )
        while self.running:
            sleep_for = STEADY_INTERVAL
            try:
                config, flow = await self._open_clients()
            except Exception as e:
                logger.info(
                    f"Failed to open clients "
                    f"({type(e).__name__}: {e}); retry in {GATE_BACKOFF}s"
                )
                await asyncio.sleep(GATE_BACKOFF)
                continue
            try:
                # Phase 1: pre-service initialisers run unconditionally.
                pre_specs = [
                    s for s in self.specs
                    if not s.instance.wait_for_services
                ]
                pre_results = {}
                for spec in pre_specs:
                    pre_results[spec.name] = await self._run_spec(
                        spec, config, flow,
                    )
                # Phase 2: gate.
                gate_ok = await self._gate_ready(config, flow)
                # Phase 3: post-service initialisers, if gate passed.
                post_results = {}
                if gate_ok:
                    post_specs = [
                        s for s in self.specs
                        if s.instance.wait_for_services
                    ]
                    for spec in post_specs:
                        post_results[spec.name] = await self._run_spec(
                            spec, config, flow,
                        )
                # Cadence selection.
                if not gate_ok:
                    sleep_for = GATE_BACKOFF
                else:
                    all_results = {**pre_results, **post_results}
                    if any(r != "skip" for r in all_results.values()):
                        sleep_for = INIT_RETRY
                    else:
                        sleep_for = STEADY_INTERVAL
            finally:
                # Clients are short-lived: always torn down per cycle.
                await self._safe_stop(config)
                await self._safe_stop(flow)
            await asyncio.sleep(sleep_for)

    # ------------------------------------------------------------------
    # CLI arg plumbing.
    # ------------------------------------------------------------------

    @staticmethod
    def add_args(parser: ArgumentParser) -> None:
        """Register CLI arguments (extends AsyncProcessor's set)."""
        AsyncProcessor.add_args(parser)
        parser.add_argument(
            '-c', '--initialisers-file',
            help='Path to YAML or JSON file describing the '
                 'initialisers to run. Ignored when the '
                 "'initialisers' parameter is provided directly "
                 '(e.g. when running inside a processor group).',
        )
def run():
    """Console-script entry point: launch the bootstrapper processor."""
    Processor.launch(default_ident, __doc__)

View file

@ -0,0 +1,20 @@
"""
Core bootstrap initialisers.
These cover the base TrustGraph deployment case. Enterprise or
third-party initialisers live in their own packages and are
referenced in the bootstrapper's config by fully-qualified dotted
path.
"""
from . pulsar_topology import PulsarTopology
from . template_seed import TemplateSeed
from . workspace_init import WorkspaceInit
from . default_flow_start import DefaultFlowStart
__all__ = [
"PulsarTopology",
"TemplateSeed",
"WorkspaceInit",
"DefaultFlowStart",
]

View file

@ -0,0 +1,101 @@
"""
DefaultFlowStart initialiser starts a named flow in a workspace
using a specified blueprint.
Separated from WorkspaceInit so deployments that want a workspace
without an auto-started flow can simply omit this initialiser.
Parameters
----------
workspace : str (default "default")
Workspace in which to start the flow.
flow_id : str (default "default")
Identifier for the started flow.
blueprint : str (required)
Blueprint name (must already exist in the workspace's config,
typically via TemplateSeed -> WorkspaceInit).
description : str (default "Default")
Human-readable description passed to flow-svc.
parameters : dict (optional)
Optional parameter overrides passed to start-flow.
"""
from trustgraph.schema import FlowRequest
from .. base import Initialiser
class DefaultFlowStart(Initialiser):
    """Start a named flow in a workspace using a specified blueprint.

    Separated from WorkspaceInit so deployments that want a workspace
    without an auto-started flow can simply omit this initialiser.
    """

    def __init__(
        self,
        workspace="default",
        flow_id="default",
        blueprint=None,
        description="Default",
        parameters=None,
        **kwargs,
    ):
        """:param workspace: workspace in which to start the flow.
        :param flow_id: identifier for the started flow.
        :param blueprint: blueprint name (required; must already exist
            in the workspace's config).
        :param description: human-readable description for flow-svc.
        :param parameters: optional parameter overrides for start-flow.
        :raises ValueError: when no blueprint is given.
        """
        super().__init__(**kwargs)
        if not blueprint:
            raise ValueError(
                "DefaultFlowStart requires 'blueprint'"
            )
        self.workspace = workspace
        self.flow_id = flow_id
        self.blueprint = blueprint
        self.description = description
        # Copy so later caller-side mutation can't leak into us.
        self.parameters = dict(parameters) if parameters else {}

    async def run(self, ctx, old_flag, new_flag):
        """Start the flow unless it is already running."""
        # Check whether the flow already exists. Belt-and-braces
        # beyond the flag gate: if an operator stops and restarts the
        # bootstrapper after the flow is already running, we don't
        # want to blindly try to start it again.
        list_resp = await ctx.flow.request(
            FlowRequest(
                operation="list-flows",
                workspace=self.workspace,
            ),
            timeout=10,
        )
        if list_resp.error:
            raise RuntimeError(
                f"list-flows failed: "
                f"{list_resp.error.type}: {list_resp.error.message}"
            )
        if self.flow_id in (list_resp.flow_ids or []):
            ctx.logger.info(
                f"Flow {self.flow_id!r} already running in workspace "
                f"{self.workspace!r}; nothing to do"
            )
            return
        ctx.logger.info(
            f"Starting flow {self.flow_id!r} "
            f"(blueprint={self.blueprint!r}) "
            f"in workspace {self.workspace!r}"
        )
        resp = await ctx.flow.request(
            FlowRequest(
                operation="start-flow",
                workspace=self.workspace,
                flow_id=self.flow_id,
                blueprint_name=self.blueprint,
                description=self.description,
                parameters=self.parameters,
            ),
            timeout=30,
        )
        if resp.error:
            raise RuntimeError(
                f"start-flow failed: "
                f"{resp.error.type}: {resp.error.message}"
            )
        ctx.logger.info(
            f"Flow {self.flow_id!r} started"
        )

View file

@ -0,0 +1,131 @@
"""
PulsarTopology initialiser creates Pulsar tenant and namespaces
with their retention policies.
Runs pre-gate (``wait_for_services = False``) because config-svc and
flow-svc can't connect to Pulsar until these namespaces exist.
Admin-API calls are idempotent so re-runs on flag change are safe.
"""
import asyncio
import requests
from .. base import Initialiser
# Namespace configs.  flow/request take broker defaults.  response
# and notify get aggressive retention — those classes carry short-lived
# request/response and notification traffic only.
NAMESPACE_CONFIG = {
    "flow": {},
    "request": {},
    "response": {
        "retention_policies": {
            "retentionSizeInMB": -1,
            "retentionTimeInMinutes": 3,
            "subscriptionExpirationTimeMinutes": 30,
        },
    },
    "notify": {
        "retention_policies": {
            "retentionSizeInMB": -1,
            "retentionTimeInMinutes": 3,
            "subscriptionExpirationTimeMinutes": 5,
        },
    },
}

# Timeout (seconds) applied to every Pulsar admin-API HTTP call.
REQUEST_TIMEOUT = 10
class PulsarTopology(Initialiser):
    """Create the Pulsar tenant and namespaces with retention policies.

    Runs pre-gate (``wait_for_services = False``) because config-svc
    and flow-svc can't connect to Pulsar until these namespaces exist.
    Admin-API calls are idempotent so re-runs on flag change are safe.
    """

    # Must run before the service gate: the gated services depend on
    # this topology existing.
    wait_for_services = False

    def __init__(
        self,
        admin_url="http://pulsar:8080",
        tenant="tg",
        **kwargs,
    ):
        """:param admin_url: Pulsar admin REST endpoint.
        :param tenant: tenant under which the namespaces are created.
        """
        super().__init__(**kwargs)
        # Normalise so the URL joins below never produce '//'.
        self.admin_url = admin_url.rstrip("/")
        self.tenant = tenant

    async def run(self, ctx, old_flag, new_flag):
        """Reconcile tenant + namespaces against the admin API."""
        # requests is blocking; offload to executor so the loop stays
        # responsive.  get_running_loop() is the supported call inside
        # a coroutine; get_event_loop() is deprecated in this context.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, self._reconcile_sync, ctx.logger)

    # ------------------------------------------------------------------
    # Sync admin-API calls.
    # ------------------------------------------------------------------

    def _get_clusters(self):
        """Return the broker's cluster list (needed for tenant create)."""
        resp = requests.get(
            f"{self.admin_url}/admin/v2/clusters",
            timeout=REQUEST_TIMEOUT,
        )
        resp.raise_for_status()
        return resp.json()

    def _tenant_exists(self):
        """True when the tenant already exists (GET returns 200)."""
        resp = requests.get(
            f"{self.admin_url}/admin/v2/tenants/{self.tenant}",
            timeout=REQUEST_TIMEOUT,
        )
        return resp.status_code == 200

    def _create_tenant(self, clusters):
        """Create the tenant; Pulsar answers 204 on success."""
        resp = requests.put(
            f"{self.admin_url}/admin/v2/tenants/{self.tenant}",
            json={"adminRoles": [], "allowedClusters": clusters},
            timeout=REQUEST_TIMEOUT,
        )
        if resp.status_code != 204:
            raise RuntimeError(
                f"Tenant {self.tenant!r} create failed: "
                f"{resp.status_code} {resp.text}"
            )

    def _namespace_exists(self, namespace):
        """True when tenant/namespace already exists."""
        resp = requests.get(
            f"{self.admin_url}/admin/v2/namespaces/"
            f"{self.tenant}/{namespace}",
            timeout=REQUEST_TIMEOUT,
        )
        return resp.status_code == 200

    def _create_namespace(self, namespace, config):
        """Create tenant/namespace with the given policy config."""
        resp = requests.put(
            f"{self.admin_url}/admin/v2/namespaces/"
            f"{self.tenant}/{namespace}",
            json=config,
            timeout=REQUEST_TIMEOUT,
        )
        if resp.status_code != 204:
            raise RuntimeError(
                f"Namespace {self.tenant}/{namespace} create failed: "
                f"{resp.status_code} {resp.text}"
            )

    def _reconcile_sync(self, logger):
        """Blocking reconcile pass: ensure tenant, then each namespace."""
        if not self._tenant_exists():
            clusters = self._get_clusters()
            logger.info(
                f"Creating tenant {self.tenant!r} with clusters {clusters}"
            )
            self._create_tenant(clusters)
        else:
            logger.debug(f"Tenant {self.tenant!r} already exists")
        for namespace, config in NAMESPACE_CONFIG.items():
            if self._namespace_exists(namespace):
                logger.debug(
                    f"Namespace {self.tenant}/{namespace} already exists"
                )
                continue
            logger.info(
                f"Creating namespace {self.tenant}/{namespace}"
            )
            self._create_namespace(namespace, config)

View file

@ -0,0 +1,93 @@
"""
TemplateSeed initialiser populates the reserved ``__template__``
workspace from an external JSON seed file.
Seed file shape:
.. code-block:: json
{
"flow-blueprint": {
"ontology": { ... },
"agent": { ... }
},
"prompt": {
...
},
...
}
Top-level keys are config types; nested keys are config entries.
Values are arbitrary JSON (they'll be ``json.dumps()``'d on write).
Parameters
----------
config_file : str
Path to the seed file on disk.
overwrite : bool (default False)
On re-run (flag change), if True overwrite all keys; if False
upsert-missing-only (preserves any operator customisation of
the template).
"""
import json
from .. base import Initialiser
# Reserved workspace holding factory-default seed config.
TEMPLATE_WORKSPACE = "__template__"

class TemplateSeed(Initialiser):
    """Populate the reserved ``__template__`` workspace from an
    external JSON seed file.

    Top-level keys in the seed are config types; nested keys are
    config entries; values are arbitrary JSON (``json.dumps()``'d
    on write).
    """

    def __init__(self, config_file, overwrite=False, **kwargs):
        """:param config_file: path to the seed file on disk.
        :param overwrite: on re-run (flag change), True overwrites all
            keys; False upserts missing keys only, preserving any
            operator customisation of the template.
        :raises ValueError: when no config_file is given.
        """
        super().__init__(**kwargs)
        if not config_file:
            raise ValueError("TemplateSeed requires 'config_file'")
        self.config_file = config_file
        self.overwrite = overwrite

    async def run(self, ctx, old_flag, new_flag):
        """Seed (first run) or re-seed (flag change) the template."""
        with open(self.config_file) as f:
            seed = json.load(f)
        if old_flag is None:
            # Clean first run — write every entry.
            await self._write_all(ctx, seed)
            return
        # Re-run after flag change.
        if self.overwrite:
            await self._write_all(ctx, seed)
        else:
            await self._upsert_missing(ctx, seed)

    async def _write_all(self, ctx, seed):
        # Unconditionally write every entry in the seed tree.
        values = []
        for type_name, entries in seed.items():
            for key, value in entries.items():
                values.append((type_name, key, json.dumps(value)))
        if values:
            await ctx.config.put_many(TEMPLATE_WORKSPACE, values)
        ctx.logger.info(
            f"Template seeded with {len(values)} entries"
        )

    async def _upsert_missing(self, ctx, seed):
        # Write only keys absent from the workspace, preserving any
        # operator edits to existing template entries.
        written = 0
        for type_name, entries in seed.items():
            existing = set(
                await ctx.config.keys(TEMPLATE_WORKSPACE, type_name)
            )
            values = []
            for key, value in entries.items():
                if key not in existing:
                    values.append(
                        (type_name, key, json.dumps(value))
                    )
            if values:
                await ctx.config.put_many(TEMPLATE_WORKSPACE, values)
                written += len(values)
        ctx.logger.info(
            f"Template upsert-missing: {written} new entries"
        )

View file

@ -0,0 +1,138 @@
"""
WorkspaceInit initialiser creates a workspace and populates it from
either the ``__template__`` workspace or a seed file on disk.
Parameters
----------
workspace : str
Target workspace to create / populate.
source : str
Either ``"template"`` (copy the full contents of the
``__template__`` workspace) or ``"seed-file"`` (read from
``seed_file``).
seed_file : str (required when source=="seed-file")
Path to a JSON seed file with the same shape TemplateSeed consumes.
overwrite : bool (default False)
On re-run (flag change), if True overwrite all keys; if False,
upsert-missing-only (preserves in-workspace customisations).
Raises (in ``run``)
-------------------
When source is ``"template"``, raises ``RuntimeError`` if the
``__template__`` workspace is empty — indicating that TemplateSeed
hasn't run yet. The bootstrapper's retry loop will re-attempt on
the next cycle once the prerequisite is satisfied.
"""
import json
from .. base import Initialiser
# Reserved workspace holding factory-default seed config.
TEMPLATE_WORKSPACE = "__template__"

class WorkspaceInit(Initialiser):
    """Create a workspace and populate it from either the
    ``__template__`` workspace or a seed file on disk.
    """

    def __init__(
        self,
        workspace="default",
        source="template",
        seed_file=None,
        overwrite=False,
        **kwargs,
    ):
        """:param workspace: target workspace to create / populate.
        :param source: ``"template"`` (copy the full contents of the
            ``__template__`` workspace) or ``"seed-file"`` (read from
            ``seed_file``).
        :param seed_file: path to a JSON seed file; required when
            ``source == "seed-file"``.
        :param overwrite: on re-run (flag change), True overwrites all
            keys; False upserts missing keys only, preserving
            in-workspace customisations.
        :raises ValueError: on an unknown source, or a missing
            seed_file when source is ``"seed-file"``.
        """
        super().__init__(**kwargs)
        if source not in ("template", "seed-file"):
            raise ValueError(
                f"WorkspaceInit: source must be 'template' or "
                f"'seed-file', got {source!r}"
            )
        if source == "seed-file" and not seed_file:
            raise ValueError(
                "WorkspaceInit: seed_file required when source='seed-file'"
            )
        self.workspace = workspace
        self.source = source
        self.seed_file = seed_file
        self.overwrite = overwrite

    async def run(self, ctx, old_flag, new_flag):
        """Load the seed tree, then write-all or upsert-missing.

        :raises RuntimeError: when source is ``"template"`` and the
            template workspace is empty (TemplateSeed hasn't run yet);
            the bootstrapper's retry loop re-attempts next cycle.
        """
        if self.source == "seed-file":
            tree = self._load_seed_file()
        else:
            tree = await self._load_from_template(ctx)
        if old_flag is None or self.overwrite:
            await self._write_all(ctx, tree)
        else:
            await self._upsert_missing(ctx, tree)

    def _load_seed_file(self):
        # Same shape as the seed file TemplateSeed consumes.
        with open(self.seed_file) as f:
            return json.load(f)

    async def _load_from_template(self, ctx):
        """Build a seed tree from the entire ``__template__`` workspace.

        Raises if the workspace is empty, so the bootstrapper knows
        the prerequisite isn't met yet."""
        raw_tree = await ctx.config.get_all(TEMPLATE_WORKSPACE)
        tree = {}
        total = 0
        for type_name, entries in raw_tree.items():
            parsed = {}
            for key, raw in entries.items():
                if raw is None:
                    continue
                try:
                    parsed[key] = json.loads(raw)
                except Exception:
                    # Non-JSON values are carried through verbatim.
                    parsed[key] = raw
                total += 1
            if parsed:
                tree[type_name] = parsed
        if total == 0:
            raise RuntimeError(
                "Template workspace is empty — has TemplateSeed run yet?"
            )
        ctx.logger.debug(
            f"Loaded {total} template entries across {len(tree)} types"
        )
        return tree

    async def _write_all(self, ctx, tree):
        # Unconditionally write every entry from the tree.
        values = []
        for type_name, entries in tree.items():
            for key, value in entries.items():
                values.append((type_name, key, json.dumps(value)))
        if values:
            await ctx.config.put_many(self.workspace, values)
        ctx.logger.info(
            f"Workspace {self.workspace!r} populated with "
            f"{len(values)} entries"
        )

    async def _upsert_missing(self, ctx, tree):
        # Write only keys absent from the target workspace.
        written = 0
        for type_name, entries in tree.items():
            existing = set(
                await ctx.config.keys(self.workspace, type_name)
            )
            values = []
            for key, value in entries.items():
                if key not in existing:
                    values.append(
                        (type_name, key, json.dumps(value))
                    )
            if values:
                await ctx.config.put_many(self.workspace, values)
                written += len(values)
        ctx.logger.info(
            f"Workspace {self.workspace!r} upsert-missing: "
            f"{written} new entries"
        )

View file

@ -24,6 +24,21 @@ logger = logging.getLogger(__name__)
default_ident = "config-svc"
def is_reserved_workspace(workspace):
    """Return True when *workspace* is a reserved, storage-only id.

    Any workspace id beginning with ``_`` is reserved for internal use
    (e.g. ``__template__`` holding factory-default seed config).
    Reads and writes work normally so bootstrap and provisioning code
    can use the standard config API, but change notifications for
    reserved workspaces are suppressed: services subscribed to the
    config push never see reserved-workspace events and so cannot
    accidentally act on template content as if it were live state.
    """
    return workspace[:1] == "_"
default_config_request_queue = config_request_queue
default_config_response_queue = config_response_queue
default_config_push_queue = config_push_queue
@ -130,6 +145,21 @@ class Processor(AsyncProcessor):
async def push(self, changes=None):
# Suppress notifications from reserved workspaces (ids starting
# with "_", e.g. "__template__"). Stored config is preserved;
# only the broadcast is filtered. Keeps services oblivious to
# template / bootstrap state.
if changes:
filtered = {}
for type_name, workspaces in changes.items():
visible = [
w for w in workspaces
if not is_reserved_workspace(w)
]
if visible:
filtered[type_name] = visible
changes = filtered
version = await self.config.get_version()
resp = ConfigPush(

View file

@ -5,7 +5,7 @@ Input is text, output is embeddings vector.
"""
from ... base import EmbeddingsService
from ollama import Client
from ollama import AsyncClient
import os
import logging
@ -30,24 +30,24 @@ class Processor(EmbeddingsService):
}
)
self.client = Client(host=ollama)
self.client = AsyncClient(host=ollama)
self.default_model = model
self._checked_models = set()
def _ensure_model(self, model_name):
async def _ensure_model(self, model_name):
"""Check if model exists locally, pull it if not."""
if model_name in self._checked_models:
return
try:
self.client.show(model_name)
await self.client.show(model_name)
self._checked_models.add(model_name)
except Exception as e:
status_code = getattr(e, 'status_code', None)
if status_code == 404 or "not found" in str(e).lower():
logger.info(f"Ollama model '{model_name}' not found locally. Pulling, this may take a while...")
try:
self.client.pull(model_name)
await self.client.pull(model_name)
self._checked_models.add(model_name)
logger.info(f"Successfully pulled Ollama model '{model_name}'.")
except Exception as pull_e:
@ -63,10 +63,10 @@ class Processor(EmbeddingsService):
use_model = model or self.default_model
# Ensure the model exists/is pulled
self._ensure_model(use_model)
await self._ensure_model(use_model)
# Ollama handles batch input efficiently
embeds = self.client.embed(
embeds = await self.client.embed(
model = use_model,
input = texts
)

View file

@ -1,22 +1,371 @@
"""
IAM-backed authentication and authorisation for the API gateway.
class Authenticator:
The gateway delegates both authentication ("who is this caller?")
and authorisation ("may they do this?") to the IAM regime via the
contract specified in docs/tech-specs/iam-contract.md. No regime-
specific policy (roles, scopes, claims) lives in the gateway.
def __init__(self, token=None, allow_all=False):
- Authentication: API keys are resolved by IAM; JWTs are validated
locally against the cached signing public key.
- Authorisation: every per-request decision is asked of IAM via
``authorise(identity, capability, resource, parameters)``, with
results cached for the TTL the regime returns.
"""
if not allow_all and token is None:
raise RuntimeError("Need a token")
import asyncio
import base64
import hashlib
import json
import logging
import time
import uuid
from dataclasses import dataclass, field
if not allow_all and token == "":
raise RuntimeError("Need a token")
from aiohttp import web
self.token = token
self.allow_all = allow_all
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import ed25519
def permitted(self, token, roles):
from ..base.iam_client import IamClient
from ..base.metrics import ProducerMetrics, SubscriberMetrics
from ..schema import (
IamRequest, IamResponse,
iam_request_queue, iam_response_queue,
)
if self.allow_all: return True
logger = logging.getLogger("auth")
if self.token != token: return False
API_KEY_CACHE_TTL = 60 # seconds
return True
# Upper bound on cache TTL the gateway honours for an authorisation
# decision, regardless of what the regime suggested. Caps the
# revocation latency window.
AUTHZ_CACHE_TTL_MAX = 60 # seconds
@dataclass
class Identity:
    """The gateway-side surface of an authenticated caller.

    Per the IAM contract this is a small fixed shape; regime-internal
    state (roles, claims, group memberships) is reachable only via
    the regime's ``authorise`` operation.  The gateway itself never
    reads policy from this object.
    """

    # Opaque handle, quoted back when calling ``authorise``.  For
    # the OSS regime this is the user record's id; the gateway
    # treats it as a string with no semantic content.
    handle: str

    # The workspace this credential authenticates to.  Used by the
    # gateway as the default-fill-in for operations that omit a
    # workspace.  Never used as policy input.
    workspace: str

    # Stable identifier for audit logs.  In OSS this is the same
    # value as ``handle``; not assumed equal in the contract.
    principal_id: str

    # How the credential was presented.  Non-policy; useful for
    # logs / metrics only.
    source: str  # "api-key" | "jwt"
def _auth_failure():
    """Uniform 401 returned for every authentication failure mode."""
    response = web.HTTPUnauthorized(
        content_type="application/json",
        text='{"error":"auth failure"}',
    )
    return response
def _access_denied():
    """Uniform 403 returned when authorisation is refused."""
    response = web.HTTPForbidden(
        content_type="application/json",
        text='{"error":"access denied"}',
    )
    return response
def _b64url_decode(s):
    """Decode base64url text *s*, restoring any stripped '=' padding."""
    padding = "=" * ((4 - len(s) % 4) % 4)
    return base64.urlsafe_b64decode(s + padding)
def _verify_jwt_eddsa(token, public_pem):
    """Verify an Ed25519 JWT and return its claims.  Raises on any
    validation failure.  Refuses non-EdDSA algorithms.

    :param token: compact-serialised JWT (``header.payload.signature``).
    :param public_pem: PEM-encoded Ed25519 public key text.
    :returns: the claims dict, trusted only after signature check.
    :raises ValueError: malformed token, wrong algorithm, wrong key
        type, or expired.
    :raises cryptography.exceptions.InvalidSignature: bad signature.
    """
    parts = token.split(".")
    if len(parts) != 3:
        raise ValueError("malformed JWT")
    h_b64, p_b64, s_b64 = parts
    # The signature covers the raw base64url header.payload text.
    signing_input = f"{h_b64}.{p_b64}".encode("ascii")
    header = json.loads(_b64url_decode(h_b64))
    # Pin the algorithm before any verification — rejects
    # algorithm-confusion attempts (e.g. alg "none").
    if header.get("alg") != "EdDSA":
        raise ValueError(f"unsupported alg: {header.get('alg')!r}")
    key = serialization.load_pem_public_key(public_pem.encode("ascii"))
    if not isinstance(key, ed25519.Ed25519PublicKey):
        raise ValueError("public key is not Ed25519")
    signature = _b64url_decode(s_b64)
    key.verify(signature, signing_input)  # raises InvalidSignature
    # Only decode/trust claims after the signature has verified.
    claims = json.loads(_b64url_decode(p_b64))
    exp = claims.get("exp")
    if exp is None or exp < time.time():
        raise ValueError("expired")
    return claims
class IamAuth:
"""Resolves bearer credentials via the IAM service.
Used by every gateway endpoint that needs authentication. Fetches
the IAM signing public key at startup (cached in memory). API
keys are resolved via the IAM service with a local hash-to-identity
cache (short TTL so revoked keys stop working within the TTL
window without any push mechanism)."""
    def __init__(self, backend, id="api-gateway"):
        """:param backend: pub/sub backend used to build IAM clients.
        :param id: processor id used in subscription names and metrics.
        """
        self.backend = backend
        self.id = id
        # Populated at start() via IAM.
        self._signing_public_pem = None
        # API-key cache: plaintext_sha256_hex -> (Identity, expires_ts)
        self._key_cache = {}
        self._key_cache_lock = asyncio.Lock()
        # Authorisation decision cache: hash(handle, capability,
        # resource, parameters) -> (allow_bool, expires_ts).  Holds
        # both allows and denies — denies cached briefly to avoid
        # hammering iam-svc with repeated rejected attempts.
        self._authz_cache: dict[str, tuple[bool, float]] = {}
        self._authz_cache_lock = asyncio.Lock()
    # ------------------------------------------------------------------
    # Short-lived client helper.  Mirrors the pattern used by the
    # bootstrap framework and AsyncProcessor: a fresh uuid suffix per
    # invocation so Pulsar exclusive subscriptions don't collide with
    # ghosts from prior calls.
    # ------------------------------------------------------------------

    def _make_client(self):
        """Build a fresh request/response client for iam-svc."""
        rr_id = str(uuid.uuid4())
        return IamClient(
            backend=self.backend,
            subscription=f"{self.id}--iam--{rr_id}",
            consumer_name=self.id,
            request_topic=iam_request_queue,
            request_schema=IamRequest,
            request_metrics=ProducerMetrics(
                processor=self.id, flow=None, name="iam-request",
            ),
            response_topic=iam_response_queue,
            response_schema=IamResponse,
            response_metrics=SubscriberMetrics(
                processor=self.id, flow=None, name="iam-response",
            ),
        )
async def _with_client(self, op):
"""Open a short-lived IamClient, run ``op(client)``, close."""
client = self._make_client()
await client.start()
try:
return await op(client)
finally:
try:
await client.stop()
except Exception:
pass
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
async def start(self, max_retries=30, retry_delay=2.0):
"""Fetch the signing public key from IAM. Retries on
failure the gateway may be starting before IAM is ready."""
async def _fetch(client):
return await client.get_signing_key_public()
for attempt in range(max_retries):
try:
pem = await self._with_client(_fetch)
if pem:
self._signing_public_pem = pem
logger.info(
"IamAuth: fetched IAM signing public key "
f"({len(pem)} bytes)"
)
return
except Exception as e:
logger.info(
f"IamAuth: waiting for IAM signing key "
f"({type(e).__name__}: {e}); "
f"retry {attempt + 1}/{max_retries}"
)
await asyncio.sleep(retry_delay)
# Don't prevent startup forever. A later authenticate() call
# will try again via the JWT path.
logger.warning(
"IamAuth: could not fetch IAM signing key at startup; "
"JWT validation will fail until it's available"
)
# ------------------------------------------------------------------
# Authentication
# ------------------------------------------------------------------
async def authenticate(self, request):
"""Extract and validate the Bearer credential from an HTTP
request. Returns an ``Identity``. Raises HTTPUnauthorized
(401 / "auth failure") on any failure mode the caller
cannot distinguish missing / malformed / invalid / expired /
revoked credentials."""
header = request.headers.get("Authorization", "")
if not header.startswith("Bearer "):
raise _auth_failure()
token = header[len("Bearer "):].strip()
if not token:
raise _auth_failure()
# API keys always start with "tg_". JWTs have two dots and
# no "tg_" prefix. Discriminate cheaply.
if token.startswith("tg_"):
return await self._resolve_api_key(token)
if token.count(".") == 2:
return self._verify_jwt(token)
raise _auth_failure()
def _verify_jwt(self, token):
if not self._signing_public_pem:
raise _auth_failure()
try:
claims = _verify_jwt_eddsa(token, self._signing_public_pem)
except Exception as e:
logger.debug(f"JWT validation failed: {type(e).__name__}: {e}")
raise _auth_failure()
sub = claims.get("sub", "")
ws = claims.get("workspace", "")
if not sub or not ws:
raise _auth_failure()
# JWT carries no policy state under the IAM contract;
# any roles / claims field is ignored here.
return Identity(
handle=sub, workspace=ws, principal_id=sub, source="jwt",
)
async def _resolve_api_key(self, plaintext):
h = hashlib.sha256(plaintext.encode("utf-8")).hexdigest()
cached = self._key_cache.get(h)
now = time.time()
if cached and cached[1] > now:
return cached[0]
async with self._key_cache_lock:
cached = self._key_cache.get(h)
if cached and cached[1] > now:
return cached[0]
try:
async def _call(client):
return await client.resolve_api_key(plaintext)
# ``roles`` is returned by the OSS regime as a hint
# but is not consulted by the gateway; all policy
# decisions go through ``authorise``.
user_id, workspace, _roles = await self._with_client(_call)
except Exception as e:
logger.debug(
f"API key resolution failed: "
f"{type(e).__name__}: {e}"
)
raise _auth_failure()
if not user_id or not workspace:
raise _auth_failure()
identity = Identity(
handle=user_id, workspace=workspace,
principal_id=user_id, source="api-key",
)
self._key_cache[h] = (identity, now + API_KEY_CACHE_TTL)
return identity
# ------------------------------------------------------------------
# Authorisation
# ------------------------------------------------------------------
@staticmethod
def _authz_cache_key(handle, capability, resource, parameters):
payload = json.dumps(
{
"h": handle,
"c": capability,
"r": resource or {},
"p": parameters or {},
},
sort_keys=True,
separators=(",", ":"),
)
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
async def authorise(self, identity, capability, resource, parameters):
"""Ask the IAM regime whether ``identity`` may perform
``capability`` on ``resource`` given ``parameters``.
Caches the decision for the regime's suggested TTL, clamped
above by ``AUTHZ_CACHE_TTL_MAX``. Both allow and deny
decisions are cached (denies briefly, to avoid hammering
iam-svc with repeated rejected attempts).
Raises ``HTTPForbidden`` (403 / "access denied") on a deny
decision. Raises ``HTTPUnauthorized`` (401 / "auth failure")
if the IAM service errors out failing closed."""
key = self._authz_cache_key(
identity.handle, capability, resource, parameters,
)
now = time.time()
cached = self._authz_cache.get(key)
if cached and cached[1] > now:
allow, _ = cached
if not allow:
raise _access_denied()
return
async with self._authz_cache_lock:
cached = self._authz_cache.get(key)
if cached and cached[1] > now:
allow, _ = cached
if not allow:
raise _access_denied()
return
try:
async def _call(client):
return await client.authorise(
identity.handle, capability,
resource or {}, parameters or {},
)
allow, ttl = await self._with_client(_call)
except Exception as e:
logger.warning(
f"authorise failed: {type(e).__name__}: {e}; "
f"failing closed for "
f"{identity.principal_id!r} cap={capability!r}"
)
raise _auth_failure()
ttl = max(0, min(int(ttl or 0), AUTHZ_CACHE_TTL_MAX))
self._authz_cache[key] = (bool(allow), now + ttl)
if not allow:
raise _access_denied()
return

View file

@ -0,0 +1,100 @@
"""
Gateway-side authorisation entry points.
Under the IAM contract (see docs/tech-specs/iam-contract.md) the
gateway holds *no* policy state. Roles, capability sets, and
workspace-scope rules all live in the IAM regime (iam-svc for OSS).
This module is the thin surface the gateway uses to ask the regime
for a decision:
- ``PUBLIC`` / ``AUTHENTICATED`` — sentinels for endpoints that don't
  go through capability-based authorisation.
- :func:`enforce` — authenticate the caller, then ask the regime.
- :func:`enforce_workspace` — default-fill the workspace from the
  caller's bound workspace and ask the regime, with the workspace
  treated as the resource address.
The capability strings themselves are an open vocabulary — see
docs/tech-specs/capabilities.md. The gateway does not validate them
beyond passing them through; an unknown capability simply produces a
deny verdict from the regime.
"""
from aiohttp import web
# Sentinel capabilities: endpoints marked with these bypass the
# regime's capability check (PUBLIC also bypasses authentication).
PUBLIC = "__public__"
AUTHENTICATED = "__authenticated__"


def access_denied():
    """Build the masked 403 response — identical for every deny."""
    return web.HTTPForbidden(
        content_type="application/json",
        text='{"error":"access denied"}',
    )


def auth_failure():
    """Build the masked 401 response — identical for every failure."""
    return web.HTTPUnauthorized(
        content_type="application/json",
        text='{"error":"auth failure"}',
    )


async def enforce(request, auth, capability):
    """Authenticate the caller and (for non-sentinel capabilities)
    ask the IAM regime whether they may invoke ``capability``.

    The resource is system-level (``{}``) and parameters are empty —
    use :func:`enforce_workspace` for workspace-scoped endpoints, or
    drive authorisation through the operation registry for richer
    cases.

    - ``PUBLIC``: returns ``None`` — no authentication.
    - ``AUTHENTICATED``: returns the ``Identity`` — no authorisation.
    - capability string: returns the ``Identity`` if the regime
      allows; raises ``HTTPForbidden`` otherwise.
    """
    if capability == PUBLIC:
        # Fully open endpoint: no identity is resolved at all.
        return None

    identity = await auth.authenticate(request)

    if capability != AUTHENTICATED:
        # Real capability string: ask the regime with a system-level
        # (empty) resource and no parameters.
        await auth.authorise(identity, capability, {}, {})
    return identity
async def enforce_workspace(data, identity, auth, capability=None):
    """Default-fill the workspace on a request body and (optionally)
    authorise the caller for ``capability`` against that workspace.

    - Target workspace = ``data["workspace"]`` if supplied, else the
      caller's bound workspace.
    - On success, ``data["workspace"]`` is overwritten with the
      resolved value so downstream code sees a single canonical
      address.
    - When ``capability`` is given, the regime is asked whether the
      caller may invoke ``capability`` on ``{workspace: target}``.
      Raises ``HTTPForbidden`` on a deny.

    For ``capability=None`` no authorisation call is made — the
    caller has presumably already authorised via :func:`enforce`
    (handy for endpoints that authorise once then resolve workspace
    on the body before forwarding).
    """
    if not isinstance(data, dict):
        # Non-dict bodies pass through untouched.
        return data

    # Empty / missing workspace falls back to the caller's binding.
    target = data.get("workspace", "") or identity.workspace
    data["workspace"] = target

    if capability is None:
        # Resolution only; authorisation already happened upstream.
        return data

    await auth.authorise(identity, capability, {"workspace": target}, {})
    return data

View file

@ -0,0 +1,40 @@
from ... schema import IamRequest, IamResponse
from ... schema import iam_request_queue, iam_response_queue
from ... messaging import TranslatorRegistry
from . requestor import ServiceRequestor
class IamRequestor(ServiceRequestor):
    """Gateway-side request/response client for the IAM service.

    Thin ``ServiceRequestor`` specialisation: wires up the IAM
    request/response queues and schemas, and translates between
    gateway JSON bodies and ``IamRequest`` / ``IamResponse`` messages
    via the shared translator registry."""

    def __init__(self, backend, consumer, subscriber, timeout=120,
                 request_queue=None, response_queue=None):
        # ``None`` queue arguments fall back to the schema-declared
        # IAM defaults; explicit overrides pass straight through.
        super().__init__(
            backend=backend,
            consumer_name=consumer,
            subscription=subscriber,
            request_queue=(
                iam_request_queue if request_queue is None
                else request_queue
            ),
            response_queue=(
                iam_response_queue if response_queue is None
                else response_queue
            ),
            request_schema=IamRequest,
            response_schema=IamResponse,
            timeout=timeout,
        )

        # Translators convert gateway dicts <-> schema objects for
        # the "iam" service kind.
        self.request_translator = (
            TranslatorRegistry.get_request_translator("iam")
        )
        self.response_translator = (
            TranslatorRegistry.get_response_translator("iam")
        )

    def to_request(self, body):
        """Decode a gateway JSON body into an ``IamRequest``."""
        return self.request_translator.decode(body)

    def from_response(self, message):
        """Encode an ``IamResponse`` along with completion status."""
        return self.response_translator.encode_with_completion(message)

View file

@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
from . config import ConfigRequestor
from . flow import FlowRequestor
from . iam import IamRequestor
from . librarian import LibrarianRequestor
from . knowledge import KnowledgeRequestor
from . collection_management import CollectionManagementRequestor
@ -72,6 +73,7 @@ request_response_dispatchers = {
global_dispatchers = {
"config": ConfigRequestor,
"flow": FlowRequestor,
"iam": IamRequestor,
"librarian": LibrarianRequestor,
"knowledge": KnowledgeRequestor,
"collection-management": CollectionManagementRequestor,
@ -105,13 +107,31 @@ class DispatcherWrapper:
class DispatcherManager:
def __init__(self, backend, config_receiver, prefix="api-gateway",
queue_overrides=None):
def __init__(self, backend, config_receiver, auth,
prefix="api-gateway", queue_overrides=None):
"""
``auth`` is required. It flows into the Mux for first-frame
WebSocket authentication and into downstream dispatcher
construction. There is no permissive default constructing
a DispatcherManager without an authenticator would be a
silent downgrade to no-auth on the socket path.
"""
if auth is None:
raise ValueError(
"DispatcherManager requires an 'auth' argument — there "
"is no no-auth mode"
)
self.backend = backend
self.config_receiver = config_receiver
self.config_receiver.add_handler(self)
self.prefix = prefix
# Gateway IamAuth — used by the socket Mux for first-frame
# auth and by any dispatcher that needs to resolve caller
# identity out-of-band.
self.auth = auth
# Store queue overrides for global services
# Format: {"config": {"request": "...", "response": "..."}, ...}
self.queue_overrides = queue_overrides or {}
@ -163,6 +183,15 @@ class DispatcherManager:
def dispatch_global_service(self):
return DispatcherWrapper(self.process_global_service)
    def dispatch_auth_iam(self):
        """Pre-configured IAM dispatcher for the gateway's auth
        endpoints (login, bootstrap, change-password). Pins the
        kind to ``iam`` so these handlers don't have to supply URL
        params the global dispatcher would expect."""

        async def _process(data, responder):
            # Fixed routing: always invoke the "iam" global service.
            return await self.invoke_global_service(data, responder, "iam")

        return DispatcherWrapper(_process)
def dispatch_core_export(self):
return DispatcherWrapper(self.process_core_export)
@ -314,7 +343,10 @@ class DispatcherManager:
async def process_socket(self, ws, running, params):
dispatcher = Mux(self, ws, running)
# The mux self-authenticates via the first-frame protocol;
# pass the gateway's IamAuth so it can validate tokens
# without reaching back into the endpoint layer.
dispatcher = Mux(self, ws, running, auth=self.auth)
return dispatcher

View file

@ -16,11 +16,28 @@ MAX_QUEUE_SIZE = 10
class Mux:
def __init__(self, dispatcher_manager, ws, running):
def __init__(self, dispatcher_manager, ws, running, auth):
"""
``auth`` is required the Mux implements the first-frame
auth protocol described in ``iam.md`` and will refuse any
non-auth frame until an ``auth-ok`` has been issued. There
is no no-auth mode.
"""
if auth is None:
raise ValueError(
"Mux requires an 'auth' argument — there is no "
"no-auth mode"
)
self.dispatcher_manager = dispatcher_manager
self.ws = ws
self.running = running
self.auth = auth
# Authenticated identity, populated by the first-frame auth
# protocol. ``None`` means the socket is not yet
# authenticated; any non-auth frame is refused.
self.identity = None
self.q = asyncio.Queue(maxsize=MAX_QUEUE_SIZE)
@ -31,6 +48,41 @@ class Mux:
if self.ws:
await self.ws.close()
    async def _handle_auth_frame(self, data):
        """Process a ``{"type": "auth", "token": "..."}`` frame.

        On success, updates ``self.identity`` and returns an
        ``auth-ok`` response frame. On failure, returns the masked
        auth-failure frame. Never raises — auth failures keep the
        socket open so the client can retry without reconnecting
        (important for browsers, which treat a handshake-time 401
        as terminal)."""
        token = data.get("token", "")
        if not token:
            # Missing token: masked failure, socket stays open.
            await self.ws.send_json({
                "type": "auth-failed",
                "error": "auth failure",
            })
            return

        # Minimal request-like shim — IamAuth.authenticate() only
        # reads the Authorization header off the request object.
        class _Shim:
            def __init__(self, tok):
                self.headers = {"Authorization": f"Bearer {tok}"}

        try:
            identity = await self.auth.authenticate(_Shim(token))
        except Exception:
            # Every failure mode (malformed / invalid / expired /
            # revoked) is masked identically.
            await self.ws.send_json({
                "type": "auth-failed",
                "error": "auth failure",
            })
            return

        # Success: bind the identity and acknowledge with the
        # caller's resolved workspace.
        self.identity = identity
        await self.ws.send_json({
            "type": "auth-ok",
            "workspace": identity.workspace,
        })
async def receive(self, msg):
request_id = None
@ -38,6 +90,16 @@ class Mux:
try:
data = msg.json()
# In-band auth protocol: the client sends
# ``{"type": "auth", "token": "..."}`` as its first frame
# (and any time it wants to re-auth: JWT refresh, token
# rotation, etc). Auth is always required on a Mux —
# there is no no-auth mode.
if isinstance(data, dict) and data.get("type") == "auth":
await self._handle_auth_frame(data)
return
request_id = data.get("id")
if "request" not in data:
@ -46,9 +108,125 @@ class Mux:
if "id" not in data:
raise RuntimeError("Bad message")
# Reject all non-auth frames until an ``auth-ok`` has
# been issued.
if self.identity is None:
await self.ws.send_json({
"id": request_id,
"error": {
"message": "auth failure",
"type": "auth-required",
},
"complete": True,
})
return
# Per-service capability gating. Resolved through the
# operation registry so the WS path matches what HTTP
# callers see — same authority, same caps.
#
# Lookup mirrors the HTTP routing decision in
# ``request_task``: presence of ``flow`` on the envelope
# means a flow-level data-plane service (graph-rag,
# agent, …); absence means a workspace-level service
# (config, flow management, librarian, …) whose specific
# operation is in the inner request body. ``iam`` is
# treated as workspace-level too — its operations are
# registered with bare names, no kind prefix.
from ..registry import lookup as _registry_lookup
from ..capabilities import enforce_workspace
from aiohttp import web as _web
service = data.get("service", "")
inner = data.get("request") or {}
inner_op = inner.get("operation", "") if isinstance(inner, dict) else ""
if data.get("flow"):
op = _registry_lookup(f"flow-service:{service}")
elif service == "iam":
op = _registry_lookup(inner_op) if inner_op else None
else:
op = _registry_lookup(f"{service}:{inner_op}") if inner_op else None
if op is None:
await self.ws.send_json({
"id": request_id,
"error": {
"message": "unknown service",
"type": "unknown-service",
},
"complete": True,
})
return
# Resolve workspace first (default-fill from the caller's
# bound workspace), then ask the regime to authorise the
# service-level capability against the matched
# operation's resource shape.
try:
await enforce_workspace(data, self.identity, self.auth)
if isinstance(inner, dict):
await enforce_workspace(inner, self.identity, self.auth)
if data.get("flow"):
resource = {
"workspace": data.get("workspace", ""),
"flow": data.get("flow", ""),
}
parameters = {}
else:
# Build a minimal RequestContext so the matched
# operation's own extractors decide resource and
# parameters — same path the HTTP endpoints take.
from ..registry import RequestContext
ctx = RequestContext(
body=inner if isinstance(inner, dict) else {},
match_info={},
identity=self.identity,
)
resource = op.extract_resource(ctx)
parameters = op.extract_parameters(ctx)
await self.auth.authorise(
self.identity, op.capability, resource, parameters,
)
except _web.HTTPForbidden:
await self.ws.send_json({
"id": request_id,
"error": {
"message": "access denied",
"type": "access-denied",
},
"complete": True,
})
return
except _web.HTTPUnauthorized:
await self.ws.send_json({
"id": request_id,
"error": {
"message": "auth failure",
"type": "auth-required",
},
"complete": True,
})
return
workspace = data["workspace"]
# Plumb authenticated caller's handle as ``actor`` so
# iam-svc handlers (whoami, future actor-scoped checks)
# know who is calling. Overwrite any caller-supplied
# value so it can't be spoofed over the WS.
if (
service == "iam"
and isinstance(data.get("request"), dict)
and self.identity is not None
):
data["request"]["actor"] = self.identity.handle
await self.q.put((
data["id"],
data.get("workspace", "default"),
workspace,
data.get("flow"),
data["service"],
data["request"]

View file

@ -0,0 +1,131 @@
"""
Gateway auth endpoints.
Three dedicated paths:
POST /api/v1/auth/login unauthenticated; username/password JWT
POST /api/v1/auth/bootstrap unauthenticated; IAM bootstrap op
POST /api/v1/auth/change-password authenticated; any role
These are the only IAM-surface operations that can be reached from
outside. Everything else routes through ``/api/v1/iam`` gated by
``users:admin``.
"""
import logging
from aiohttp import web
from .. capabilities import enforce, PUBLIC, AUTHENTICATED
logger = logging.getLogger("auth-endpoints")
logger.setLevel(logging.INFO)
class AuthEndpoints:
    """Groups the three auth-surface handlers. Each forwards to the
    IAM service via the existing ``IamRequestor`` dispatcher."""

    def __init__(self, iam_dispatcher, auth):
        # Dispatcher wrapper that forwards request bodies to iam-svc.
        self.iam = iam_dispatcher
        # Gateway IamAuth — used by the authenticated endpoint(s).
        self.auth = auth

    async def start(self):
        # No startup work; present for endpoint-manager symmetry.
        pass

    def add_routes(self, app):
        # Fixed paths; registered ahead of the generic /api/v1/{kind}
        # routes so they win the match.
        app.add_routes([
            web.post("/api/v1/auth/login", self.login),
            web.post("/api/v1/auth/bootstrap", self.bootstrap),
            web.post(
                "/api/v1/auth/bootstrap-status",
                self.bootstrap_status,
            ),
            web.post(
                "/api/v1/auth/change-password",
                self.change_password,
            ),
        ])

    async def _forward(self, body):
        """Forward ``body`` to iam-svc. These calls are single-shot,
        so the streaming responder is a no-op."""
        async def responder(x, fin):
            pass
        return await self.iam.process(body, responder)

    async def login(self, request):
        """Public. Accepts {username, password, workspace?}. Returns
        {jwt, jwt_expires} on success; IAM's masked auth failure on
        anything else."""
        await enforce(request, self.auth, PUBLIC)
        try:
            body = await request.json()
        except Exception:
            return web.json_response(
                {"error": "invalid json"}, status=400,
            )
        req = {
            "operation": "login",
            "username": body.get("username", ""),
            "password": body.get("password", ""),
            "workspace": body.get("workspace", ""),
        }
        resp = await self._forward(req)
        if "error" in resp:
            # Mask every failure mode identically — no username /
            # password oracle.
            return web.json_response(
                {"error": "auth failure"}, status=401,
            )
        return web.json_response(resp)

    async def bootstrap(self, request):
        """Public. Valid only when IAM is running in bootstrap mode
        with empty tables. In every other case the IAM service
        returns a masked auth-failure."""
        await enforce(request, self.auth, PUBLIC)
        resp = await self._forward({"operation": "bootstrap"})
        if "error" in resp:
            return web.json_response(
                {"error": "auth failure"}, status=401,
            )
        return web.json_response(resp)

    async def bootstrap_status(self, request):
        """Public, side-effect-free. Returns ``{"bootstrap_available":
        bool}`` so a UI can decide whether to render first-run setup
        without invoking the consuming ``bootstrap`` op."""
        await enforce(request, self.auth, PUBLIC)
        resp = await self._forward({"operation": "bootstrap-status"})
        if "error" in resp:
            return web.json_response(
                {"error": "auth failure"}, status=401,
            )
        return web.json_response(resp)

    async def change_password(self, request):
        """Authenticated (any role). Accepts {current_password,
        new_password}; user_id is taken from the authenticated
        identity — the caller cannot change someone else's password
        this way (reset-password is the admin path)."""
        identity = await enforce(request, self.auth, AUTHENTICATED)
        try:
            body = await request.json()
        except Exception:
            return web.json_response(
                {"error": "invalid json"}, status=400,
            )
        req = {
            "operation": "change-password",
            "user_id": identity.handle,
            "password": body.get("current_password", ""),
            "new_password": body.get("new_password", ""),
        }
        resp = await self._forward(req)
        if "error" in resp:
            # Wrong current password → masked 401; anything else is
            # surfaced as a 400 with IAM's message.
            # NOTE(review): assumes resp["error"] is a dict with
            # "type"/"message" keys — confirm against iam-svc's
            # error envelope.
            err_type = resp.get("error", {}).get("type", "")
            if err_type == "auth-failed":
                return web.json_response(
                    {"error": "auth failure"}, status=401,
                )
            return web.json_response(
                {"error": resp.get("error", {}).get("message", "error")},
                status=400,
            )
        return web.json_response(resp)

View file

@ -1,28 +1,27 @@
import asyncio
from aiohttp import web
import uuid
import logging
from aiohttp import web
from .. capabilities import enforce, enforce_workspace
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class ConstantEndpoint:
def __init__(self, endpoint_path, auth, dispatcher):
def __init__(self, endpoint_path, auth, dispatcher, capability):
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
app.add_routes([
web.post(self.path, self.handle),
])
@ -31,22 +30,14 @@ class ConstantEndpoint:
logger.debug(f"Processing request: {request.path}")
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except:
token = ""
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
identity = await enforce(request, self.auth, self.capability)
try:
data = await request.json()
if identity is not None:
await enforce_workspace(data, identity, self.auth)
async def responder(x, fin):
pass
@ -54,10 +45,8 @@ class ConstantEndpoint:
return web.json_response(resp)
except web.HTTPException:
raise
except Exception as e:
logging.error(f"Exception: {e}")
return web.json_response(
{ "error": str(e) }
)
logger.error(f"Exception: {e}", exc_info=True)
return web.json_response({"error": str(e)})

View file

@ -4,16 +4,18 @@ from aiohttp import web
from trustgraph.i18n import get_language_pack
from .. capabilities import enforce
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class I18nPackEndpoint:
def __init__(self, endpoint_path: str, auth):
def __init__(self, endpoint_path: str, auth, capability):
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
async def start(self):
pass
@ -26,26 +28,13 @@ class I18nPackEndpoint:
async def handle(self, request):
logger.debug(f"Processing i18n pack request: {request.path}")
token = ""
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except Exception:
token = ""
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
await enforce(request, self.auth, self.capability)
lang = request.match_info.get("lang") or "en"
# This is a path traversal defense, and is a critical sec defense.
# Do not remove!
# Path-traversal defense — critical, do not remove.
if "/" in lang or ".." in lang:
return web.HTTPBadRequest(reason="Invalid language code")
pack = get_language_pack(lang)
return web.json_response(pack)

View file

@ -0,0 +1,114 @@
"""
Registry-driven /api/v1/iam endpoint.
The gateway no longer gates IAM management with a single coarse
``users:admin`` capability. Instead, each operation declares its
own capability + resource shape in the registry (``registry.py``);
this endpoint reads the body's ``operation`` field, looks up the
declaration, and asks the IAM regime to authorise the call.
Operations not in the registry produce a 400 ``unknown operation``.
This is the gateway's primary mechanism for fail-closed gating of
the IAM surface — the registry is the source of truth.
"""
import logging
from aiohttp import web
from .. capabilities import (
PUBLIC, AUTHENTICATED, auth_failure,
)
from .. registry import lookup, RequestContext
logger = logging.getLogger("iam-endpoint")
logger.setLevel(logging.INFO)
class IamEndpoint:
    """POST /api/v1/iam — generic forwarder gated by the operation
    registry. The IAM dispatcher (``dispatcher``) forwards the
    body verbatim to iam-svc once authorisation succeeds."""

    def __init__(self, endpoint_path, auth, dispatcher):
        # Route path, e.g. "/api/v1/iam".
        self.path = endpoint_path
        # Gateway IamAuth — authentication and authorisation.
        self.auth = auth
        # Forwards bodies to iam-svc.
        self.dispatcher = dispatcher

    async def start(self):
        # No startup work; present for endpoint-manager symmetry.
        pass

    def add_routes(self, app):
        app.add_routes([web.post(self.path, self.handle)])

    async def handle(self, request):
        """Authenticate / authorise per the operation registry, then
        forward the body to iam-svc.

        Returns 400 for malformed JSON, non-object bodies, unknown
        operations, or extractor failures. IamAuth's masked 401/403
        exceptions propagate to aiohttp unchanged."""
        try:
            body = await request.json()
        except Exception:
            return web.json_response(
                {"error": "invalid json"}, status=400,
            )
        if not isinstance(body, dict):
            return web.json_response(
                {"error": "body must be an object"}, status=400,
            )

        op_name = body.get("operation", "")
        op = lookup(op_name)
        if op is None:
            # Fail closed: only registry-declared operations pass.
            return web.json_response(
                {"error": "unknown operation"}, status=400,
            )

        # Authentication: required for everything except PUBLIC.
        # (HTTPUnauthorized from authenticate() propagates as-is —
        # no try/except needed around it.)
        identity = None
        if op.capability != PUBLIC:
            identity = await self.auth.authenticate(request)

        # Authorisation: capability sentinels short-circuit the
        # regime call; capability strings go through authorise().
        if op.capability not in (PUBLIC, AUTHENTICATED):
            ctx = RequestContext(
                body=body,
                match_info=dict(request.match_info),
                identity=identity,
            )
            try:
                resource = op.extract_resource(ctx)
                parameters = op.extract_parameters(ctx)
            except Exception as e:
                logger.warning(
                    f"extractor failed for {op_name!r}: "
                    f"{type(e).__name__}: {e}"
                )
                return web.json_response(
                    {"error": "bad request"}, status=400,
                )
            await self.auth.authorise(
                identity, op.capability, resource, parameters,
            )

        # Plumb the authenticated caller's handle through as ``actor``
        # so iam-svc handlers (e.g. whoami, future actor-scoped
        # checks) know who is making the request. The gateway is
        # the only authority for this — body-supplied ``actor``
        # values are overwritten so callers can't impersonate.
        if identity is not None:
            body["actor"] = identity.handle

        async def responder(x, fin):
            # Single-shot call; the streaming responder is a no-op.
            pass

        try:
            resp = await self.dispatcher.process(body, responder)
        except web.HTTPException:
            raise
        except Exception as e:
            logger.error(f"Exception: {e}", exc_info=True)
            return web.json_response({"error": str(e)})
        return web.json_response(resp)

View file

@ -8,72 +8,269 @@ from . variable_endpoint import VariableEndpoint
from . socket import SocketEndpoint
from . metrics import MetricsEndpoint
from . i18n import I18nPackEndpoint
from . auth_endpoints import AuthEndpoints
from . iam_endpoint import IamEndpoint
from . registry_endpoint import RegistryRoutedVariableEndpoint
from .. capabilities import PUBLIC, AUTHENTICATED, auth_failure
from .. registry import lookup as _registry_lookup, RequestContext
from .. dispatch.manager import DispatcherManager
# /api/v1/{kind} (config / flow / librarian / knowledge /
# collection-management), /api/v1/iam, and /api/v1/flow/{flow}/...
# routes are all gated per-operation by the registry, not by a
# per-kind capability map. Login / bootstrap / change-password are
# served by AuthEndpoints with their own PUBLIC / AUTHENTICATED
# sentinels.
import logging as _mgr_logging
_mgr_logger = _mgr_logging.getLogger("endpoint")
class _RoutedVariableEndpoint:
    """HTTP endpoint that gates per request via the operation
    registry.

    The URL's ``kind`` parameter combined with a fixed
    ``registry_prefix`` yields the registry key — e.g. prefix
    ``flow-service`` and kind ``agent`` looks up
    ``flow-service:agent``.

    Used for ``/api/v1/flow/{flow}/service/{kind}`` (per-flow
    data-plane services). ``/api/v1/{kind}`` (workspace-level
    global services) goes through ``RegistryRoutedVariableEndpoint``
    which discriminates on body operation as well as URL kind."""

    def __init__(self, endpoint_path, auth, dispatcher, registry_prefix):
        self.path = endpoint_path
        self.auth = auth
        self.dispatcher = dispatcher
        self._registry_prefix = registry_prefix

    async def start(self):
        # Nothing to initialise.
        pass

    def add_routes(self, app):
        app.add_routes([web.post(self.path, self.handle)])

    async def handle(self, request):
        # Unknown kinds are a routing miss (404), not an auth failure.
        url_kind = request.match_info.get("kind", "")
        registered = _registry_lookup(f"{self._registry_prefix}:{url_kind}")
        if registered is None:
            return web.json_response(
                {"error": "unknown kind"}, status=404,
            )

        identity = await self.auth.authenticate(request)

        try:
            payload = await request.json()

            # The matched operation's own extractors decide resource
            # and parameters; then the regime is asked for a decision.
            ctx = RequestContext(
                body=payload if isinstance(payload, dict) else {},
                match_info=dict(request.match_info),
                identity=identity,
            )
            await self.auth.authorise(
                identity,
                registered.capability,
                registered.extract_resource(ctx),
                registered.extract_parameters(ctx),
            )

            async def responder(x, fin):
                # Single-shot call; streaming responder is a no-op.
                pass

            result = await self.dispatcher.process(
                payload, responder, request.match_info,
            )
            return web.json_response(result)
        except web.HTTPException:
            # Masked 401/403 responses propagate unchanged.
            raise
        except Exception as e:
            _mgr_logger.error(f"Exception: {e}", exc_info=True)
            return web.json_response({"error": str(e)})
class _RoutedSocketEndpoint:
    """WebSocket endpoint gated per request via the operation
    registry. Like ``_RoutedVariableEndpoint`` but for the
    streaming flow import / export socket paths."""

    def __init__(self, endpoint_path, auth, dispatcher, registry_prefix):
        self.path = endpoint_path
        self.auth = auth
        self.dispatcher = dispatcher
        # Combined with the URL's {kind} to form the registry key.
        self._registry_prefix = registry_prefix

    async def start(self):
        # No startup work; present for endpoint-manager symmetry.
        pass

    def add_routes(self, app):
        app.add_routes([web.get(self.path, self.handle)])

    async def handle(self, request):
        # Unknown kinds are a routing miss (404), not an auth failure.
        kind = request.match_info.get("kind", "")
        op = _registry_lookup(f"{self._registry_prefix}:{kind}")
        if op is None:
            return web.json_response(
                {"error": "unknown kind"}, status=404,
            )
        # WebSocket clients can't set headers on the handshake, so
        # the credential arrives as a ?token= query parameter.
        token = request.query.get("token", "")
        if not token:
            return auth_failure()
        from . socket import _QueryTokenRequest
        try:
            identity = await self.auth.authenticate(
                _QueryTokenRequest(token)
            )
        except web.HTTPException as e:
            # Returned (not raised) so the masked error body is sent
            # as the handshake response.
            return e
        ctx = RequestContext(
            body={},
            match_info=dict(request.match_info),
            identity=identity,
        )
        try:
            resource = op.extract_resource(ctx)
            parameters = op.extract_parameters(ctx)
            await self.auth.authorise(
                identity, op.capability, resource, parameters,
            )
        except web.HTTPException as e:
            return e
        # Delegate the websocket handling to a standalone SocketEndpoint
        # with the resolved capability, bypassing the per-request mutation
        # concern by instantiating fresh state.
        ws_ep = SocketEndpoint(
            endpoint_path=self.path,
            auth=self.auth,
            dispatcher=self.dispatcher,
            capability=op.capability,
        )
        return await ws_ep.handle(request)
class EndpointManager:
def __init__(
self, dispatcher_manager, auth, prometheus_url, timeout=600
self, dispatcher_manager, auth, prometheus_url, timeout=600,
):
self.dispatcher_manager = dispatcher_manager
self.timeout = timeout
self.services = {
}
self.endpoints = [
# Auth surface — public / authenticated-any. Must come
# before the generic /api/v1/{kind} routes to win the
# match for /api/v1/auth/* paths. aiohttp routes in
# registration order, so we prepend here.
AuthEndpoints(
iam_dispatcher=dispatcher_manager.dispatch_auth_iam(),
auth=auth,
),
# /api/v1/iam — registry-driven IAM management. Per
# operation gating happens inside IamEndpoint via the
# operation registry; the dispatcher forwards verbatim
# to iam-svc once authorisation has succeeded. Listed
# before the generic /api/v1/{kind} route so it wins
# the match for "iam".
IamEndpoint(
endpoint_path="/api/v1/iam",
auth=auth,
dispatcher=dispatcher_manager.dispatch_auth_iam(),
),
I18nPackEndpoint(
endpoint_path = "/api/v1/i18n/packs/{lang}",
auth = auth,
endpoint_path="/api/v1/i18n/packs/{lang}",
auth=auth,
capability=PUBLIC,
),
MetricsEndpoint(
endpoint_path = "/api/metrics",
prometheus_url = prometheus_url,
auth = auth,
endpoint_path="/api/metrics",
prometheus_url=prometheus_url,
auth=auth,
capability="metrics:read",
),
VariableEndpoint(
endpoint_path = "/api/v1/{kind}", auth = auth,
dispatcher = dispatcher_manager.dispatch_global_service(),
# Global services: registry-driven per-operation gating.
# Each kind+op combination has a registry entry that
# declares its capability and resource shape. Listed
# after the IAM and auth-surface routes; aiohttp's
# path matcher prefers the more-specific path so this
# variable route doesn't shadow them.
RegistryRoutedVariableEndpoint(
endpoint_path="/api/v1/{kind}",
auth=auth,
dispatcher=dispatcher_manager.dispatch_global_service(),
),
# /api/v1/socket: WebSocket handshake accepts
# unconditionally; the Mux dispatcher runs the
# first-frame auth protocol. Handshake-time 401s break
# browser reconnection, so authentication is always
# in-band for this endpoint.
SocketEndpoint(
endpoint_path = "/api/v1/socket",
auth = auth,
dispatcher = dispatcher_manager.dispatch_socket()
endpoint_path="/api/v1/socket",
auth=auth,
dispatcher=dispatcher_manager.dispatch_socket(),
capability=AUTHENTICATED, # informational only; bypassed
in_band_auth=True,
),
VariableEndpoint(
endpoint_path = "/api/v1/flow/{flow}/service/{kind}",
auth = auth,
dispatcher = dispatcher_manager.dispatch_flow_service(),
# Per-flow request/response services — gated per
# ``flow-service:<kind>`` registry entry.
_RoutedVariableEndpoint(
endpoint_path="/api/v1/flow/{flow}/service/{kind}",
auth=auth,
dispatcher=dispatcher_manager.dispatch_flow_service(),
registry_prefix="flow-service",
),
SocketEndpoint(
endpoint_path = "/api/v1/flow/{flow}/import/{kind}",
auth = auth,
dispatcher = dispatcher_manager.dispatch_flow_import()
# Per-flow streaming import/export — gated per
# ``flow-import:<kind>`` / ``flow-export:<kind>`` registry
# entry.
_RoutedSocketEndpoint(
endpoint_path="/api/v1/flow/{flow}/import/{kind}",
auth=auth,
dispatcher=dispatcher_manager.dispatch_flow_import(),
registry_prefix="flow-import",
),
SocketEndpoint(
endpoint_path = "/api/v1/flow/{flow}/export/{kind}",
auth = auth,
dispatcher = dispatcher_manager.dispatch_flow_export()
_RoutedSocketEndpoint(
endpoint_path="/api/v1/flow/{flow}/export/{kind}",
auth=auth,
dispatcher=dispatcher_manager.dispatch_flow_export(),
registry_prefix="flow-export",
),
StreamEndpoint(
endpoint_path="/api/v1/import-core",
auth=auth,
method="POST",
dispatcher=dispatcher_manager.dispatch_core_import(),
# Cross-subject import — require the admin bundle via a
# single representative capability.
capability="users:admin",
),
StreamEndpoint(
endpoint_path = "/api/v1/import-core",
auth = auth,
method = "POST",
dispatcher = dispatcher_manager.dispatch_core_import(),
endpoint_path="/api/v1/export-core",
auth=auth,
method="GET",
dispatcher=dispatcher_manager.dispatch_core_export(),
capability="users:admin",
),
StreamEndpoint(
endpoint_path = "/api/v1/export-core",
auth = auth,
method = "GET",
dispatcher = dispatcher_manager.dispatch_core_export(),
),
StreamEndpoint(
endpoint_path = "/api/v1/document-stream",
auth = auth,
method = "GET",
dispatcher = dispatcher_manager.dispatch_document_stream(),
endpoint_path="/api/v1/document-stream",
auth=auth,
method="GET",
dispatcher=dispatcher_manager.dispatch_document_stream(),
capability="documents:read",
),
]
@ -84,4 +281,3 @@ class EndpointManager:
    async def start(self):
        """Start every registered endpoint, in registration order."""
        for ep in self.endpoints:
            await ep.start()

View file

@ -10,17 +10,19 @@ import asyncio
import uuid
import logging
from .. capabilities import enforce
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class MetricsEndpoint:
def __init__(self, prometheus_url, endpoint_path, auth):
def __init__(self, prometheus_url, endpoint_path, auth, capability):
self.prometheus_url = prometheus_url
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
async def start(self):
pass
@ -35,38 +37,39 @@ class MetricsEndpoint:
logger.debug(f"Processing metrics request: {request.path}")
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except:
token = ""
await enforce(request, self.auth, self.capability)
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
path = request.match_info["path"]
url = (
self.prometheus_url + "/api/v1/" + path + "?" +
request.query_string
)
try:
path = request.match_info["path"]
async with aiohttp.ClientSession() as session:
url = (
self.prometheus_url + "/api/v1/" + path + "?" +
request.query_string
)
async with session.get(url) as resp:
return web.Response(
status=resp.status,
text=await resp.text()
)
except aiohttp.ClientConnectionError as e:
# Upstream unreachable (connect refused, DNS failure,
# server disconnect). Distinguish from our own errors so
# callers know where the fault is.
logger.error(f"Metrics upstream {url} unreachable: {e}")
return web.Response(
status=502,
text=f"Bad Gateway: metrics upstream unreachable: {e}",
)
except Exception as e:
logging.error(f"Exception: {e}")
raise web.HTTPInternalServerError()
logger.error(f"Metrics proxy exception: {e}", exc_info=True)
return web.Response(
status=500,
text=f"Internal Server Error: {e}",
)

View file

@ -0,0 +1,123 @@
"""
Registry-driven dispatch for ``/api/v1/{kind}`` global services.
The body's ``operation`` field plus the URL's ``{kind}`` together
form the canonical operation name (``<kind>:<operation>``) that the
gateway looks up in ``registry.py``. The matched operation
declares its capability and resource shape; this endpoint asks the
IAM regime to authorise the call before forwarding the body
verbatim to the backend dispatcher.
The dispatcher is the same ``dispatch_global_service()`` factory the
old coarse path used; only the gating layer has changed.
Operations not present in the registry are rejected with 400
``unknown operation`` — the gateway fails closed.
"""
import logging
from aiohttp import web
from .. capabilities import (
PUBLIC, AUTHENTICATED, auth_failure,
)
from .. registry import lookup, RequestContext
logger = logging.getLogger("registry-endpoint")
logger.setLevel(logging.INFO)
class RegistryRoutedVariableEndpoint:
    """POST /api/v1/{kind} — kind comes from the URL, operation comes
    from the body, both are joined as the registry key.

    Fails closed: a kind/operation pair absent from the registry is
    rejected before authentication is attempted.
    """

    def __init__(self, endpoint_path, auth, dispatcher):
        # endpoint_path: aiohttp route pattern containing {kind}.
        self.path = endpoint_path
        self.auth = auth
        self.dispatcher = dispatcher

    async def start(self):
        # Nothing to start; present for endpoint interface uniformity.
        pass

    def add_routes(self, app):
        app.add_routes([web.post(self.path, self.handle)])

    async def handle(self, request):
        """Gate, authorise and forward one request.

        Order matters: kind check → body parse → operation lookup →
        authenticate (unless PUBLIC) → authorise (unless PUBLIC /
        AUTHENTICATED) → forward verbatim to the dispatcher.
        """
        kind = request.match_info.get("kind", "")
        if not kind:
            return web.json_response(
                {"error": "missing kind"}, status=404,
            )
        try:
            body = await request.json()
        except Exception:
            return web.json_response(
                {"error": "invalid json"}, status=400,
            )
        if not isinstance(body, dict):
            return web.json_response(
                {"error": "body must be an object"}, status=400,
            )
        op_name = body.get("operation", "")
        if not op_name:
            return web.json_response(
                {"error": "missing operation"}, status=400,
            )
        # Canonical registry key: <kind>:<operation>.
        registry_key = f"{kind}:{op_name}"
        op = lookup(registry_key)
        if op is None:
            # Fail closed on anything not declared in the registry.
            return web.json_response(
                {"error": "unknown operation"}, status=400,
            )
        identity = None
        if op.capability != PUBLIC:
            # authenticate() raises an HTTPException (propagated to
            # aiohttp) on failure.
            identity = await self.auth.authenticate(request)
        if op.capability not in (PUBLIC, AUTHENTICATED):
            ctx = RequestContext(
                body=body,
                match_info=dict(request.match_info),
                identity=identity,
            )
            try:
                resource = op.extract_resource(ctx)
                parameters = op.extract_parameters(ctx)
            except Exception as e:
                # Extractor failure means the request shape is wrong,
                # not that we are — report 400, not 500.
                logger.warning(
                    f"extractor failed for {registry_key!r}: "
                    f"{type(e).__name__}: {e}"
                )
                return web.json_response(
                    {"error": "bad request"}, status=400,
                )
            await self.auth.authorise(
                identity, op.capability, resource, parameters,
            )
            # Default-fill workspace into the body so downstream
            # dispatchers see the canonical resolved value. The
            # extractor has already pulled the workspace out;
            # mirror it back to the body for the verbatim forward.
            if "workspace" in resource:
                body["workspace"] = resource["workspace"]

        async def responder(x, fin):
            # Request/response path: streaming callback is a no-op.
            pass

        try:
            resp = await self.dispatcher.process(
                body, responder, request.match_info,
            )
        except web.HTTPException:
            # Let aiohttp render HTTP errors natively.
            raise
        except Exception as e:
            logger.error(f"Exception: {e}", exc_info=True)
            return web.json_response({"error": str(e)})
        return web.json_response(resp)

View file

@ -4,6 +4,9 @@ from aiohttp import web, WSMsgType
import logging
from .. running import Running
from .. capabilities import (
PUBLIC, AUTHENTICATED, auth_failure,
)
logger = logging.getLogger("socket")
logger.setLevel(logging.INFO)
@ -11,12 +14,25 @@ logger.setLevel(logging.INFO)
class SocketEndpoint:
def __init__(
self, endpoint_path, auth, dispatcher,
self, endpoint_path, auth, dispatcher, capability,
in_band_auth=False,
):
"""
``in_band_auth=True`` skips the handshake-time auth check.
The WebSocket handshake always succeeds; the dispatcher is
expected to gate itself via the first-frame auth protocol
(see ``Mux``).
This avoids the browser problem where a 401 on the handshake
is treated as permanent and prevents reconnection, and lets
long-lived sockets refresh their credential mid-session by
sending a new auth frame.
"""
self.path = endpoint_path
self.auth = auth
self.operation = "socket"
self.capability = capability
self.in_band_auth = in_band_auth
self.dispatcher = dispatcher
@ -61,15 +77,33 @@ class SocketEndpoint:
raise
async def handle(self, request):
"""Enhanced handler with better cleanup"""
try:
token = request.query['token']
except:
token = ""
"""Enhanced handler with better cleanup.
Auth: WebSocket clients pass the bearer token on the
``?token=...`` query string; we wrap it into a synthetic
Authorization header before delegating to the standard auth
path so the IAM-backed flow (JWT / API key) applies uniformly.
The first-frame auth protocol described in the IAM spec is
a future upgrade."""
if not self.in_band_auth and self.capability != PUBLIC:
token = request.query.get("token", "")
if not token:
return auth_failure()
try:
identity = await self.auth.authenticate(
_QueryTokenRequest(token)
)
except web.HTTPException as e:
return e
if self.capability != AUTHENTICATED:
try:
await self.auth.authorise(
identity, self.capability, {}, {},
)
except web.HTTPException as e:
return e
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
# 50MB max message size
ws = web.WebSocketResponse(max_msg_size=52428800)
@ -150,3 +184,11 @@ class SocketEndpoint:
web.get(self.path, self.handle),
])
class _QueryTokenRequest:
"""Minimal shim that exposes headers["Authorization"] to
IamAuth.authenticate(), derived from a query-string token."""
def __init__(self, token):
self.headers = {"Authorization": f"Bearer {token}"}

View file

@ -1,82 +1,64 @@
import asyncio
from aiohttp import web
import logging
from aiohttp import web
from .. capabilities import enforce
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class StreamEndpoint:
def __init__(self, endpoint_path, auth, dispatcher, method="POST"):
def __init__(
self, endpoint_path, auth, dispatcher, capability, method="POST",
):
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
self.method = method
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
if self.method == "POST":
app.add_routes([
web.post(self.path, self.handle),
])
app.add_routes([web.post(self.path, self.handle)])
elif self.method == "GET":
app.add_routes([
web.get(self.path, self.handle),
])
app.add_routes([web.get(self.path, self.handle)])
else:
raise RuntimeError("Bad method" + self.method)
raise RuntimeError("Bad method " + self.method)
async def handle(self, request):
logger.debug(f"Processing request: {request.path}")
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except:
token = ""
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
await enforce(request, self.auth, self.capability)
try:
data = request.content
async def error(err):
return web.HTTPInternalServerError(text = err)
return web.HTTPInternalServerError(text=err)
async def ok(
status=200, reason="OK", type="application/octet-stream"
status=200, reason="OK",
type="application/octet-stream",
):
response = web.StreamResponse(
status = status, reason = reason,
headers = {"Content-Type": type}
status=status, reason=reason,
headers={"Content-Type": type},
)
await response.prepare(request)
return response
resp = await self.dispatcher.process(
data, error, ok, request
)
resp = await self.dispatcher.process(data, error, ok, request)
return resp
except web.HTTPException:
raise
except Exception as e:
logging.error(f"Exception: {e}")
return web.json_response(
{ "error": str(e) }
)
logger.error(f"Exception: {e}", exc_info=True)
return web.json_response({"error": str(e)})

View file

@ -1,27 +1,27 @@
import asyncio
from aiohttp import web
import logging
from aiohttp import web
from .. capabilities import enforce, enforce_workspace
logger = logging.getLogger("endpoint")
logger.setLevel(logging.INFO)
class VariableEndpoint:
def __init__(self, endpoint_path, auth, dispatcher):
def __init__(self, endpoint_path, auth, dispatcher, capability):
self.path = endpoint_path
self.auth = auth
self.operation = "service"
self.capability = capability
self.dispatcher = dispatcher
async def start(self):
pass
def add_routes(self, app):
app.add_routes([
web.post(self.path, self.handle),
])
@ -30,35 +30,25 @@ class VariableEndpoint:
logger.debug(f"Processing request: {request.path}")
try:
ht = request.headers["Authorization"]
tokens = ht.split(" ", 2)
if tokens[0] != "Bearer":
return web.HTTPUnauthorized()
token = tokens[1]
except:
token = ""
if not self.auth.permitted(token, self.operation):
return web.HTTPUnauthorized()
identity = await enforce(request, self.auth, self.capability)
try:
data = await request.json()
if identity is not None:
await enforce_workspace(data, identity, self.auth)
async def responder(x, fin):
pass
resp = await self.dispatcher.process(
data, responder, request.match_info
data, responder, request.match_info,
)
return web.json_response(resp)
except web.HTTPException:
raise
except Exception as e:
logging.error(f"Exception: {e}")
return web.json_response(
{ "error": str(e) }
)
logger.error(f"Exception: {e}", exc_info=True)
return web.json_response({"error": str(e)})

View file

@ -0,0 +1,533 @@
"""
Gateway operation registry.
Single declarative table mapping each operation the gateway
recognises to:
- The capability the IAM regime is asked to authorise against.
- The resource level (system / workspace / flow) determines the
shape of the resource identifier handed to ``authorise``.
- Extractors that build the resource and parameters from the
request context.
This is a gateway-internal concept. It is not part of the IAM
contract — the contract specifies what arguments ``authorise``
receives; the registry is how the gateway populates them.
See docs/tech-specs/iam-contract.md for the contract and
docs/tech-specs/iam.md for the request anatomy.
"""
from dataclasses import dataclass, field
from typing import Any, Callable
# Sentinels for operations that don't go through capability-based
# authorisation. Mirror the values used in capabilities.py so the
# gateway endpoint layer can recognise them uniformly.
PUBLIC = "__public__"
AUTHENTICATED = "__authenticated__"
class ResourceLevel:
    """Where the operation's resource lives.

    ``SYSTEM`` — operation acts on a deployment-level resource
        (the user registry, the workspace registry, the signing
        key). resource = {}. Workspace, if relevant, is a
        parameter, not an address.

    ``WORKSPACE`` — operation acts on something within a workspace
        (config, library, knowledge, collections, flow lifecycle).
        resource = {workspace}.

    ``FLOW`` — operation acts on something within a flow within a
        workspace (graph, agent, llm, etc.).
        resource = {workspace, flow}.
    """

    SYSTEM = "system"
    WORKSPACE = "workspace"
    FLOW = "flow"
@dataclass
class RequestContext:
    """The bundle of inputs the registry's extractors operate on.

    Assembled by the gateway from the incoming request after
    authentication; one instance per request."""

    # Parsed JSON body (HTTP) or inner request payload (WebSocket).
    # Extractors defensively re-check isinstance(body, dict).
    body: dict = field(default_factory=dict)

    # URL path components (HTTP) or WebSocket envelope routing
    # fields (id, service, workspace, flow).
    match_info: dict = field(default_factory=dict)

    # Authenticated identity for default-fill-in. Always present
    # by the time extractors run, except for PUBLIC operations
    # where it is None.
    identity: Any = None
@dataclass
class Operation:
    """Declared operation the gateway can dispatch + authorise."""

    # Canonical operation name (used for registry lookup, audit,
    # debug logs). Mirrors the operation strings in the IAM
    # service and other backends where applicable.
    name: str

    # Capability required to invoke this operation. Either a
    # string from the capability vocabulary in capabilities.md, or
    # the PUBLIC / AUTHENTICATED sentinel for operations that
    # don't go through capability-based authorisation.
    capability: str

    # Where the operation's resource lives (a ResourceLevel value).
    # Determines the shape of the resource argument passed to
    # authorise.
    resource_level: str

    # Build the resource identifier from the request context.
    # Returns a dict with the appropriate components for the
    # resource level: {} for SYSTEM, {workspace} for WORKSPACE,
    # {workspace, flow} for FLOW. Default-fill-in of workspace
    # from identity.workspace happens here when applicable.
    extract_resource: Callable[[RequestContext], dict]

    # Build the parameters dict — decision-relevant fields the
    # operation supplied that are not part of the resource
    # address. E.g. workspace association on a system-level
    # user-registry operation.
    extract_parameters: Callable[[RequestContext], dict]
# ---------------------------------------------------------------------------
# Registry storage.
# ---------------------------------------------------------------------------

_REGISTRY: dict[str, Operation] = {}


def register(op: Operation) -> None:
    """Add *op* under its canonical name; duplicates are a coding error."""
    if op.name not in _REGISTRY:
        _REGISTRY[op.name] = op
        return
    raise RuntimeError(
        f"operation {op.name!r} already registered"
    )


def lookup(name: str) -> Operation | None:
    """Return the operation registered under *name*, or None if absent."""
    try:
        return _REGISTRY[name]
    except KeyError:
        return None


def all_operations() -> list[Operation]:
    """Snapshot of every registered operation, in registration order."""
    return [*_REGISTRY.values()]
# ---------------------------------------------------------------------------
# Common extractor helpers.
# ---------------------------------------------------------------------------
def _empty_resource(_ctx: RequestContext) -> dict:
    """System-level operations have no resource address: always {}."""
    return dict()
def _workspace_from_body(ctx: RequestContext) -> dict:
    """Workspace-level resource sourced from the request body's
    ``workspace`` field, defaulting to the caller's bound workspace.

    Returns ``{"workspace": <value>}``. The value is normalised to
    ``""`` when neither the body nor the identity supplies one
    (``dict.get`` would otherwise yield ``None``), keeping the
    resource shape consistent with ``_workspace_param_only``.
    """
    ws = (ctx.body.get("workspace") if isinstance(ctx.body, dict) else "")
    if not ws and ctx.identity is not None:
        # Default-fill from the authenticated identity's bound workspace.
        ws = ctx.identity.workspace
    return {"workspace": ws or ""}
def _flow_from_match_info(ctx: RequestContext) -> dict:
    """Flow-level resource from URL path components or WS envelope
    fields. Workspace and flow are both required parts of the
    address — deliberately no default-fill-in, since the address is
    the operation's identity."""
    routing = ctx.match_info
    return {
        "workspace": routing.get("workspace", ""),
        "flow": routing.get("flow", ""),
    }
def _no_parameters(_ctx: RequestContext) -> dict:
    """Operation carries no decision-relevant parameters."""
    return dict()
def _body_as_parameters(ctx: RequestContext) -> dict:
    """Treat every body field as a parameter — used when the body is
    small and uniformly decision-relevant (e.g. user-registry ops
    where the body's user.workspace is what the regime checks
    against the admin's scope)."""
    if isinstance(ctx.body, dict):
        return dict(ctx.body)
    return {}
def _workspace_param_only(ctx: RequestContext) -> dict:
    """Parameters dict carrying only the workspace association.

    Used by system-level operations (e.g. user-registry ops) where
    the workspace isn't part of the resource address but is the
    field the regime uses to scope the admin's authority.

    Lookup precedence: inner ``user.workspace`` (create-user), then
    inner ``workspace_record.id`` (create-workspace), then the
    top-level body ``workspace``, then the caller's bound workspace;
    finally normalised to "".
    """
    body = ctx.body if isinstance(ctx.body, dict) else {}

    def _inner_dict(key):
        # Sub-objects may be absent or non-dicts; treat both as {}.
        value = body.get(key)
        return value if isinstance(value, dict) else {}

    ws = (
        _inner_dict("user").get("workspace")
        or _inner_dict("workspace_record").get("id")
        or body.get("workspace")
    )
    if not ws and ctx.identity is not None:
        ws = ctx.identity.workspace
    return {"workspace": ws or ""}
# ---------------------------------------------------------------------------
# Operation registrations.
#
# The gateway looks operations up by their canonical name (the same
# string the request body / WS envelope carries in its ``operation``
# field where applicable). Auth-surface operations (login, bootstrap,
# change-password) are not listed here — they have their own routes
# in auth_endpoints.py and use PUBLIC / AUTHENTICATED sentinels
# directly. Pure gateway↔IAM internal operations (resolve-api-key,
# authorise, authorise-many, get-signing-key-public) are likewise
# excluded; they are never invoked over the public API.
# ---------------------------------------------------------------------------
# IAM management operations. All routed through /api/v1/iam, body
# carries ``operation`` plus operation-specific fields.
# User registry: SYSTEM-level resource (users are global, identified
# by handle). The admin's authority is scoped per workspace via the
# parameters {workspace} field — that's what the regime checks
# against the admin's role workspace_scope.
for _name, _cap in (
    ("create-user", "users:admin"),
    ("list-users", "users:read"),
    ("get-user", "users:read"),
    ("update-user", "users:write"),
    ("disable-user", "users:admin"),
    ("enable-user", "users:admin"),
    ("delete-user", "users:admin"),
    ("reset-password", "users:admin"),
):
    register(Operation(
        name=_name,
        capability=_cap,
        resource_level=ResourceLevel.SYSTEM,
        extract_resource=_empty_resource,
        extract_parameters=_workspace_param_only,
    ))
# API keys: SYSTEM-level resource — like users, a key record exists
# in the deployment-wide keys registry. The workspace the key
# authenticates to is a property of the record, not a containment;
# it appears as a parameter so the regime can scope the admin's
# authority to issue / list / revoke against it.
for _name in ("create-api-key", "list-api-keys", "revoke-api-key"):
    register(Operation(
        name=_name,
        capability="keys:admin",
        resource_level=ResourceLevel.SYSTEM,
        extract_resource=_empty_resource,
        extract_parameters=_workspace_param_only,
    ))
# Workspace registry: SYSTEM-level resource (workspaces are the
# top-level addressable unit). No parameters — the workspace being
# acted on is identified by the body, not used as a scope cue.
for _name in (
    "create-workspace",
    "list-workspaces",
    "get-workspace",
    "update-workspace",
    "disable-workspace",
):
    register(Operation(
        name=_name,
        capability="workspaces:admin",
        resource_level=ResourceLevel.SYSTEM,
        extract_resource=_empty_resource,
        extract_parameters=_no_parameters,
    ))

# Signing key: SYSTEM-level operational op.
register(Operation(
    name="rotate-signing-key",
    capability="iam:admin",
    resource_level=ResourceLevel.SYSTEM,
    extract_resource=_empty_resource,
    extract_parameters=_no_parameters,
))
# ---------------------------------------------------------------------------
# Auth-surface entries.
#
# Listed here so the registry is the one place the gateway looks for
# operation→capability mappings — including the sentinels for paths
# that don't go through capability-based authorisation. The actual
# routing is in auth_endpoints.py; these entries let the registry-
# driven dispatcher recognise the operation if it sees it on a
# generic path.
# ---------------------------------------------------------------------------

for _name, _cap in (
    ("login", PUBLIC),
    ("bootstrap", PUBLIC),
    ("bootstrap-status", PUBLIC),
    ("change-password", AUTHENTICATED),
    ("whoami", AUTHENTICATED),
):
    register(Operation(
        name=_name,
        capability=_cap,
        resource_level=ResourceLevel.SYSTEM,
        extract_resource=_empty_resource,
        extract_parameters=_no_parameters,
    ))
# ---------------------------------------------------------------------------
# Generic kind/operation entries.
#
# Names are ``<kind>:<operation>`` so the registry key is unique
# across dispatchers. All entries below are workspace-level
# resources (workspace defaulted from the caller's bound workspace
# if absent). Read/write distinction maps to the existing
# ``<subject>:read`` / ``<subject>:write`` capability vocabulary
# defined in capabilities.md.
# ---------------------------------------------------------------------------
def _register_kind_op(kind: str, op: str, capability: str) -> None:
    """Register a workspace-level ``<kind>:<op>`` using the standard
    extractors (workspace sourced from the body, no extra
    parameters)."""
    spec = dict(
        name=f"{kind}:{op}",
        capability=capability,
        resource_level=ResourceLevel.WORKSPACE,
        extract_resource=_workspace_from_body,
        extract_parameters=_no_parameters,
    )
    register(Operation(**spec))
# Kind → (capability, operations) table. Registration order follows
# the per-kind read-then-write convention used throughout.
_KIND_OPS = (
    # config: KV-style workspace config service.
    ("config", "config:read",
     ("get", "list", "getvalues", "getvalues-all-ws", "config")),
    ("config", "config:write", ("put", "delete")),
    # flow: flow-blueprint and flow-lifecycle service.
    ("flow", "flows:read",
     ("list-blueprints", "get-blueprint", "list-flows", "get-flow")),
    ("flow", "flows:write",
     ("put-blueprint", "delete-blueprint", "start-flow", "stop-flow")),
    # librarian: document storage and processing service.
    ("librarian", "documents:read",
     ("get-document-metadata", "get-document-content",
      "stream-document", "list-documents", "list-processing",
      "get-upload-status", "list-uploads")),
    ("librarian", "documents:write",
     ("add-document", "remove-document", "update-document",
      "add-processing", "remove-processing",
      "begin-upload", "upload-chunk", "complete-upload",
      "abort-upload")),
    # knowledge: knowledge-graph core service.
    ("knowledge", "knowledge:read", ("get-kg-core", "list-kg-cores")),
    ("knowledge", "knowledge:write",
     ("put-kg-core", "delete-kg-core",
      "load-kg-core", "unload-kg-core")),
    # collection-management: workspace collection lifecycle.
    ("collection-management", "collections:read",
     ("list-collections",)),
    ("collection-management", "collections:write",
     ("update-collection", "delete-collection")),
)

for _kind, _cap, _ops in _KIND_OPS:
    for _op in _ops:
        _register_kind_op(_kind, _op, _cap)
# ---------------------------------------------------------------------------
# Per-flow data-plane services.
#
# /api/v1/flow/{flow}/service/{kind} and the streaming
# /api/v1/flow/{flow}/{import,export}/{kind} paths. No body-level
# ``operation`` discriminator — the URL kind is the operation
# identity. Resource is FLOW level (workspace + flow).
#
# Names: ``flow-service:<kind>``, ``flow-import:<kind>``,
# ``flow-export:<kind>``.
# ---------------------------------------------------------------------------
def _register_flow_kind(prefix: str, kind: str, capability: str) -> None:
    """Register a FLOW-level ``<prefix>:<kind>`` data-plane entry
    (resource = workspace + flow from the URL / envelope)."""
    entry = Operation(
        name=f"{prefix}:{kind}",
        capability=capability,
        resource_level=ResourceLevel.FLOW,
        extract_resource=_flow_from_match_info,
        extract_parameters=_no_parameters,
    )
    register(entry)
# Request/response services on /api/v1/flow/{flow}/service/{kind}.
_FLOW_SERVICES = {
    "agent": "agent",
    "text-completion": "llm",
    "prompt": "llm",
    "mcp-tool": "mcp",
    "graph-rag": "graph:read",
    "document-rag": "documents:read",
    "embeddings": "embeddings",
    "graph-embeddings": "graph:read",
    "document-embeddings": "documents:read",
    "triples": "graph:read",
    "rows": "rows:read",
    "nlp-query": "rows:read",
    "structured-query": "rows:read",
    "structured-diag": "rows:read",
    "row-embeddings": "rows:read",
    "sparql": "graph:read",
}

# Streaming import socket endpoints.
_FLOW_IMPORTS = {
    "triples": "graph:write",
    "graph-embeddings": "graph:write",
    "document-embeddings": "documents:write",
    "entity-contexts": "documents:write",
    "rows": "rows:write",
}

# Streaming export socket endpoints.
_FLOW_EXPORTS = {
    "triples": "graph:read",
    "graph-embeddings": "graph:read",
    "document-embeddings": "documents:read",
    "entity-contexts": "documents:read",
}

for _prefix, _table in (
    ("flow-service", _FLOW_SERVICES),
    ("flow-import", _FLOW_IMPORTS),
    ("flow-export", _FLOW_EXPORTS),
):
    for _kind, _cap in _table.items():
        _register_flow_kind(_prefix, _kind, _cap)

View file

@ -12,7 +12,7 @@ import os
from trustgraph.base.logging import setup_logging, add_logging_args
from trustgraph.base.pubsub import get_pubsub, add_pubsub_args
from . auth import Authenticator
from . auth import IamAuth
from . config.receiver import ConfigReceiver
from . dispatch.manager import DispatcherManager
@ -35,7 +35,6 @@ default_prometheus_url = os.getenv("PROMETHEUS_URL", "http://prometheus:9090")
default_pulsar_api_key = os.getenv("PULSAR_API_KEY", None)
default_timeout = 600
default_port = 8088
default_api_token = os.getenv("GATEWAY_SECRET", "")
class Api:
@ -60,13 +59,14 @@ class Api:
if not self.prometheus_url.endswith("/"):
self.prometheus_url += "/"
api_token = config.get("api_token", default_api_token)
# Token not set, or token equal empty string means no auth
if api_token:
self.auth = Authenticator(token=api_token)
else:
self.auth = Authenticator(allow_all=True)
# IAM-backed authentication. The legacy GATEWAY_SECRET
# shared-token path has been removed — there is no
# "open for everyone" fallback. The gateway cannot
# authenticate any request until IAM is reachable.
self.auth = IamAuth(
backend=self.pubsub_backend,
id=config.get("id", "api-gateway"),
)
self.config_receiver = ConfigReceiver(self.pubsub_backend)
@ -118,6 +118,7 @@ class Api:
config_receiver = self.config_receiver,
prefix = "gateway",
queue_overrides = queue_overrides,
auth = self.auth,
)
self.endpoint_manager = EndpointManager(
@ -132,12 +133,18 @@ class Api:
]
async def app_factory(self):
self.app = web.Application(
middlewares=[],
client_max_size=256 * 1024 * 1024
)
# Fetch IAM signing public key before accepting traffic.
# Blocks for a bounded retry window; the gateway starts even
# if IAM is still unreachable (JWT validation will 401 until
# the key is available).
await self.auth.start()
await self.config_receiver.start()
for ep in self.endpoints:
@ -189,12 +196,6 @@ def run():
help=f'API request timeout in seconds (default: {default_timeout})',
)
parser.add_argument(
'--api-token',
default=default_api_token,
help=f'Secret API token (default: no auth)',
)
add_logging_args(parser)
parser.add_argument(

View file

@ -0,0 +1 @@
from . service import *

View file

@ -0,0 +1,4 @@
from . service import run
run()

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,233 @@
"""
IAM service processor. Terminates the IAM request queue and forwards
each request to the IamService business logic, then returns the
response on the IAM response queue.
Shape mirrors trustgraph.config.service.
"""
import logging
import os
from trustgraph.schema import Error
from trustgraph.schema import IamRequest, IamResponse
from trustgraph.schema import iam_request_queue, iam_response_queue
from trustgraph.base import AsyncProcessor, Consumer, Producer
from trustgraph.base import ConsumerMetrics, ProducerMetrics
from trustgraph.base.cassandra_config import (
add_cassandra_args, resolve_cassandra_config,
)
from . iam import IamService
logger = logging.getLogger(__name__)
default_ident = "iam-svc"
default_iam_request_queue = iam_request_queue
default_iam_response_queue = iam_response_queue
# Environment variables consulted as a fallback when the
# corresponding params field is not set in the processor-group YAML
# or via CLI. Intended for K8s Secret / env-var injection so the
# bootstrap token never has to live in the YAML (and thus in git).
ENV_BOOTSTRAP_MODE = "IAM_BOOTSTRAP_MODE"
ENV_BOOTSTRAP_TOKEN = "IAM_BOOTSTRAP_TOKEN"
class Processor(AsyncProcessor):
    """IAM service processor.

    Terminates the IAM request queue: each IamRequest is forwarded to
    the IamService business logic, and the resulting IamResponse is
    published on the IAM response queue, correlated via the ``id``
    message property. Shape mirrors trustgraph.config.service.
    """

    def __init__(self, **params):
        """Resolve configuration, validate the bootstrap settings
        (fail-closed), and wire up the consumer/producer pair plus
        the Cassandra-backed IamService.

        Raises:
            RuntimeError: when bootstrap-mode is missing/invalid, when
                token mode lacks a token, or when a token is supplied
                alongside bootstrap mode (ambiguous intent).
        """

        iam_req_q = params.get(
            "iam_request_queue", default_iam_request_queue,
        )
        iam_resp_q = params.get(
            "iam_response_queue", default_iam_response_queue,
        )

        # Resolve bootstrap mode + token. Precedence: explicit
        # params (CLI / processor-group YAML) → environment variable
        # → unset (fail-closed). The env-var path is the K8s-native
        # injection point: an `IAM_BOOTSTRAP_TOKEN` from a Secret
        # never has to land in the YAML, and therefore never enters
        # git history.
        bootstrap_mode = (
            params.get("bootstrap_mode")
            or os.environ.get(ENV_BOOTSTRAP_MODE)
        )
        bootstrap_token = (
            params.get("bootstrap_token")
            or os.environ.get(ENV_BOOTSTRAP_TOKEN)
        )

        # Fail closed: refuse to start on an unset or unknown mode.
        if bootstrap_mode not in ("token", "bootstrap"):
            raise RuntimeError(
                "iam-svc: bootstrap-mode is required. Set to 'token' "
                "(with bootstrap-token) for production, or 'bootstrap' "
                "to enable the explicit bootstrap operation over the "
                "pub/sub bus (dev / quick-start only, not safe under "
                "public exposure). Configurable via processor-group "
                f"params or the {ENV_BOOTSTRAP_MODE} environment "
                "variable. Refusing to start."
            )

        if bootstrap_mode == "token" and not bootstrap_token:
            raise RuntimeError(
                "iam-svc: bootstrap-mode=token requires bootstrap-token "
                f"(or the {ENV_BOOTSTRAP_TOKEN} environment "
                "variable). Refusing to start."
            )

        if bootstrap_mode == "bootstrap" and bootstrap_token:
            raise RuntimeError(
                "iam-svc: bootstrap-token is not accepted when "
                "bootstrap-mode=bootstrap. Ambiguous intent. "
                "Refusing to start."
            )

        self.bootstrap_mode = bootstrap_mode
        self.bootstrap_token = bootstrap_token

        # Cassandra connection settings: explicit params win, the
        # resolver fills in env/default fallbacks and the keyspace.
        cassandra_host = params.get("cassandra_host")
        cassandra_username = params.get("cassandra_username")
        cassandra_password = params.get("cassandra_password")

        hosts, username, password, keyspace = resolve_cassandra_config(
            host=cassandra_host,
            username=cassandra_username,
            password=cassandra_password,
            default_keyspace="iam",
        )
        self.cassandra_host = hosts
        self.cassandra_username = username
        self.cassandra_password = password

        super().__init__(
            **params | {
                "iam_request_schema": IamRequest.__name__,
                "iam_response_schema": IamResponse.__name__,
                "cassandra_host": self.cassandra_host,
                "cassandra_username": self.cassandra_username,
                "cassandra_password": self.cassandra_password,
            }
        )

        iam_request_metrics = ConsumerMetrics(
            processor=self.id, flow=None, name="iam-request",
        )
        iam_response_metrics = ProducerMetrics(
            processor=self.id, flow=None, name="iam-response",
        )

        self.iam_request_topic = iam_req_q

        self.iam_request_consumer = Consumer(
            taskgroup=self.taskgroup,
            backend=self.pubsub,
            flow=None,
            topic=iam_req_q,
            subscriber=self.id,
            schema=IamRequest,
            handler=self.on_iam_request,
            metrics=iam_request_metrics,
        )

        self.iam_response_producer = Producer(
            backend=self.pubsub,
            topic=iam_resp_q,
            schema=IamResponse,
            metrics=iam_response_metrics,
        )

        self.iam = IamService(
            host=self.cassandra_host,
            username=self.cassandra_username,
            password=self.cassandra_password,
            keyspace=keyspace,
            bootstrap_mode=self.bootstrap_mode,
            bootstrap_token=self.bootstrap_token,
        )

        logger.info(
            "IAM service initialised (bootstrap-mode=%s)",
            self.bootstrap_mode,
        )

    async def start(self):
        """Bring the processor online: ensure the topic exists, run the
        token-mode auto-bootstrap, then start consuming."""

        await self.pubsub.ensure_topic(self.iam_request_topic)

        # Token-mode auto-bootstrap runs before we accept requests so
        # the first inbound call always sees a populated table.
        await self.iam.auto_bootstrap_if_token_mode()

        await self.iam_request_consumer.start()

    async def on_iam_request(self, msg, consumer, flow):
        """Handle one inbound IamRequest.

        On failure an IamResponse carrying an Error payload is returned
        to the caller — but only when the correlation id could be read
        from the message; otherwise the failure is just logged.
        """

        # Don't shadow the builtin `id`; initialise before the try so
        # the except branch knows whether a reply can be correlated.
        request_id = None

        try:
            v = msg.value()
            request_id = msg.properties()["id"]

            logger.debug(
                "Handling IAM request %s op=%r", request_id, v.operation,
            )

            resp = await self.iam.handle(v)

            await self.iam_response_producer.send(
                resp, properties={"id": request_id},
            )

        except Exception as e:

            logger.error(
                "IAM request failed: %s: %s", type(e).__name__, e,
                exc_info=True,
            )

            resp = IamResponse(
                error=Error(type="internal-error", message=str(e)),
            )

            if request_id is not None:
                await self.iam_response_producer.send(
                    resp, properties={"id": request_id},
                )

    @staticmethod
    def add_args(parser):
        """Add IAM-specific CLI arguments on top of the base processor
        and Cassandra argument sets."""

        AsyncProcessor.add_args(parser)

        parser.add_argument(
            "--iam-request-queue",
            default=default_iam_request_queue,
            help=f"IAM request queue (default: {default_iam_request_queue})",
        )

        parser.add_argument(
            "--iam-response-queue",
            default=default_iam_response_queue,
            help=f"IAM response queue (default: {default_iam_response_queue})",
        )

        parser.add_argument(
            "--bootstrap-mode",
            default=None,
            choices=["token", "bootstrap"],
            help=(
                "IAM bootstrap mode (required). "
                "'token' = operator supplies the initial admin API "
                "key via --bootstrap-token; auto-seeds on first start, "
                "bootstrap operation refused. "
                "'bootstrap' = bootstrap operation is live over the "
                "bus until tables are populated; a token is generated "
                "and returned by tg-bootstrap-iam. Unsafe to run "
                "'bootstrap' mode with public exposure."
            ),
        )

        parser.add_argument(
            "--bootstrap-token",
            default=None,
            help=(
                "Initial admin API key plaintext, required when "
                "--bootstrap-mode=token. Treat as a one-time "
                "credential: the operator should rotate to a new key "
                "and revoke this one after first use."
            ),
        )

        add_cassandra_args(parser)
def run():
    """Console-script entry point: launch the IAM service processor."""
    Processor.launch(default_ident, __doc__)

View file

@ -4,7 +4,7 @@ Simple LLM service, performs text prompt completion using an Ollama service.
Input is prompt, output is response.
"""
from ollama import Client
from ollama import AsyncClient
import os
import logging
@ -38,23 +38,23 @@ class Processor(LlmService):
self.default_model = model
self.temperature = temperature
self.llm = Client(host=ollama)
self.llm = AsyncClient(host=ollama)
self._checked_models = set()
def _ensure_model(self, model_name):
async def _ensure_model(self, model_name):
"""Check if model exists locally, pull it if not."""
if model_name in self._checked_models:
return
try:
self.llm.show(model_name)
await self.llm.show(model_name)
self._checked_models.add(model_name)
except Exception as e:
status_code = getattr(e, 'status_code', None)
if status_code == 404 or "not found" in str(e).lower():
logger.info(f"Ollama model '{model_name}' not found locally. Pulling, this may take a while...")
try:
self.llm.pull(model_name)
await self.llm.pull(model_name)
self._checked_models.add(model_name)
logger.info(f"Successfully pulled Ollama model '{model_name}'.")
except Exception as pull_e:
@ -66,9 +66,9 @@ class Processor(LlmService):
# Use provided model or fall back to default
model_name = model or self.default_model
# Ensure the model exists/is pulled
self._ensure_model(model_name)
await self._ensure_model(model_name)
# Use provided temperature or fall back to default
effective_temperature = temperature if temperature is not None else self.temperature
@ -79,7 +79,7 @@ class Processor(LlmService):
try:
response = self.llm.generate(model_name, prompt, options={'temperature': effective_temperature})
response = await self.llm.generate(model_name, prompt, options={'temperature': effective_temperature})
response_text = response['response']
logger.debug("Sending response...")
@ -113,7 +113,7 @@ class Processor(LlmService):
model_name = model or self.default_model
# Ensure the model exists/is pulled
self._ensure_model(model_name)
await self._ensure_model(model_name)
effective_temperature = temperature if temperature is not None else self.temperature
@ -123,7 +123,7 @@ class Processor(LlmService):
prompt = system + "\n\n" + prompt
try:
stream = self.llm.generate(
stream = await self.llm.generate(
model_name,
prompt,
options={'temperature': effective_temperature},
@ -133,7 +133,7 @@ class Processor(LlmService):
total_input_tokens = 0
total_output_tokens = 0
for chunk in stream:
async for chunk in stream:
if 'response' in chunk and chunk['response']:
yield LlmChunk(
text=chunk['response'],

View file

@ -0,0 +1,436 @@
"""
IAM Cassandra table store.
Tables:
- iam_workspaces (id primary key)
- iam_users (id primary key) + iam_users_by_username lookup table
(workspace, username) -> id
- iam_api_keys (key_hash primary key) with secondary index on user_id
- iam_signing_keys (kid primary key) RSA keypairs for JWT signing
See docs/tech-specs/iam-protocol.md for the wire-level context.
"""
import logging
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from ssl import SSLContext, PROTOCOL_TLSv1_2
from . cassandra_async import async_execute
logger = logging.getLogger(__name__)
class IamTableStore:
    def __init__(
        self,
        cassandra_host, cassandra_username, cassandra_password,
        keyspace,
    ):
        """Connect to Cassandra, ensure the IAM schema exists and
        prepare all statements.

        ``cassandra_host`` may be a comma-separated string or a list
        of contact points. When both username and password are given,
        the connection uses TLS 1.2 with plain-text auth; otherwise an
        unauthenticated, unencrypted cluster connection is made.
        """
        self.keyspace = keyspace
        logger.info("IAM: connecting to Cassandra...")
        # Accept "host1,host2" as well as an already-split list.
        if isinstance(cassandra_host, str):
            cassandra_host = [h.strip() for h in cassandra_host.split(",")]
        if cassandra_username and cassandra_password:
            # NOTE(review): credentials imply TLS here; an
            # authenticated-but-plaintext deployment is not supported
            # by this branch — confirm that is intended.
            ssl_context = SSLContext(PROTOCOL_TLSv1_2)
            auth_provider = PlainTextAuthProvider(
                username=cassandra_username, password=cassandra_password,
            )
            self.cluster = Cluster(
                cassandra_host,
                auth_provider=auth_provider,
                ssl_context=ssl_context,
            )
        else:
            self.cluster = Cluster(cassandra_host)
        self.cassandra = self.cluster.connect()
        logger.info("IAM: connected.")
        # Both are idempotent, so safe on every service start.
        self._ensure_schema()
        self._prepare_statements()
    def _ensure_schema(self):
        """Create the IAM keyspace, tables and indexes if absent.

        Every statement is ``IF NOT EXISTS`` so the whole method is
        idempotent and safe to run on every start.
        """
        # FIXME: Replication factor should be configurable.
        self.cassandra.execute(f"""
            create keyspace if not exists {self.keyspace}
            with replication = {{
                'class' : 'SimpleStrategy',
                'replication_factor' : 1
            }};
        """)
        self.cassandra.set_keyspace(self.keyspace)
        # Workspaces: one row per workspace.
        self.cassandra.execute("""
            CREATE TABLE IF NOT EXISTS iam_workspaces (
                id text PRIMARY KEY,
                name text,
                enabled boolean,
                created timestamp
            );
        """)
        # Users, keyed by opaque id; username resolution goes via the
        # iam_users_by_username lookup table below.
        self.cassandra.execute("""
            CREATE TABLE IF NOT EXISTS iam_users (
                id text PRIMARY KEY,
                workspace text,
                username text,
                name text,
                email text,
                password_hash text,
                roles set<text>,
                enabled boolean,
                must_change_password boolean,
                created timestamp
            );
        """)
        # (workspace, username) -> user_id lookup table.
        self.cassandra.execute("""
            CREATE TABLE IF NOT EXISTS iam_users_by_username (
                workspace text,
                username text,
                user_id text,
                PRIMARY KEY ((workspace), username)
            );
        """)
        # API keys keyed by hash of the plaintext; secondary indexes
        # allow lookup by owner and by key record id.
        self.cassandra.execute("""
            CREATE TABLE IF NOT EXISTS iam_api_keys (
                key_hash text PRIMARY KEY,
                id text,
                user_id text,
                name text,
                prefix text,
                expires timestamp,
                created timestamp,
                last_used timestamp
            );
        """)
        self.cassandra.execute("""
            CREATE INDEX IF NOT EXISTS iam_api_keys_user_id_idx
            ON iam_api_keys (user_id);
        """)
        self.cassandra.execute("""
            CREATE INDEX IF NOT EXISTS iam_api_keys_id_idx
            ON iam_api_keys (id);
        """)
        # RSA keypairs used for JWT signing; `retired` marks rotation.
        self.cassandra.execute("""
            CREATE TABLE IF NOT EXISTS iam_signing_keys (
                kid text PRIMARY KEY,
                private_pem text,
                public_pem text,
                created timestamp,
                retired timestamp
            );
        """)
        logger.info("IAM: Cassandra schema OK.")
    def _prepare_statements(self):
        """Prepare every CQL statement used by the store, once, at
        construction time; the prepared handles are kept as instance
        attributes for reuse by the async accessor methods."""
        c = self.cassandra
        # --- Workspaces -------------------------------------------
        self.put_workspace_stmt = c.prepare("""
            INSERT INTO iam_workspaces (id, name, enabled, created)
            VALUES (?, ?, ?, ?)
        """)
        self.get_workspace_stmt = c.prepare("""
            SELECT id, name, enabled, created FROM iam_workspaces
            WHERE id = ?
        """)
        self.list_workspaces_stmt = c.prepare("""
            SELECT id, name, enabled, created FROM iam_workspaces
        """)
        # --- Users ------------------------------------------------
        self.put_user_stmt = c.prepare("""
            INSERT INTO iam_users (
                id, workspace, username, name, email, password_hash,
                roles, enabled, must_change_password, created
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """)
        self.get_user_stmt = c.prepare("""
            SELECT id, workspace, username, name, email, password_hash,
                   roles, enabled, must_change_password, created
            FROM iam_users WHERE id = ?
        """)
        # NOTE(review): ALLOW FILTERING scans — acceptable only while
        # user counts stay small; confirm scale expectations.
        self.list_users_by_workspace_stmt = c.prepare("""
            SELECT id, workspace, username, name, email, password_hash,
                   roles, enabled, must_change_password, created
            FROM iam_users WHERE workspace = ? ALLOW FILTERING
        """)
        self.list_users_stmt = c.prepare("""
            SELECT id, workspace, username, name, email, password_hash,
                   roles, enabled, must_change_password, created
            FROM iam_users
        """)
        # --- Username lookup table --------------------------------
        self.put_username_lookup_stmt = c.prepare("""
            INSERT INTO iam_users_by_username (workspace, username, user_id)
            VALUES (?, ?, ?)
        """)
        self.get_user_id_by_username_stmt = c.prepare("""
            SELECT user_id FROM iam_users_by_username
            WHERE workspace = ? AND username = ?
        """)
        self.delete_username_lookup_stmt = c.prepare("""
            DELETE FROM iam_users_by_username
            WHERE workspace = ? AND username = ?
        """)
        self.delete_user_stmt = c.prepare("""
            DELETE FROM iam_users WHERE id = ?
        """)
        # --- API keys ---------------------------------------------
        self.put_api_key_stmt = c.prepare("""
            INSERT INTO iam_api_keys (
                key_hash, id, user_id, name, prefix, expires,
                created, last_used
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """)
        self.get_api_key_by_hash_stmt = c.prepare("""
            SELECT key_hash, id, user_id, name, prefix, expires,
                   created, last_used
            FROM iam_api_keys WHERE key_hash = ?
        """)
        self.get_api_key_by_id_stmt = c.prepare("""
            SELECT key_hash, id, user_id, name, prefix, expires,
                   created, last_used
            FROM iam_api_keys WHERE id = ?
        """)
        self.list_api_keys_by_user_stmt = c.prepare("""
            SELECT key_hash, id, user_id, name, prefix, expires,
                   created, last_used
            FROM iam_api_keys WHERE user_id = ?
        """)
        self.delete_api_key_stmt = c.prepare("""
            DELETE FROM iam_api_keys WHERE key_hash = ?
        """)
        # --- Signing keys -----------------------------------------
        self.put_signing_key_stmt = c.prepare("""
            INSERT INTO iam_signing_keys (
                kid, private_pem, public_pem, created, retired
            )
            VALUES (?, ?, ?, ?, ?)
        """)
        self.list_signing_keys_stmt = c.prepare("""
            SELECT kid, private_pem, public_pem, created, retired
            FROM iam_signing_keys
        """)
        self.retire_signing_key_stmt = c.prepare("""
            UPDATE iam_signing_keys SET retired = ? WHERE kid = ?
        """)
        # --- Partial updates --------------------------------------
        self.update_user_profile_stmt = c.prepare("""
            UPDATE iam_users
            SET name = ?, email = ?, roles = ?, enabled = ?,
                must_change_password = ?
            WHERE id = ?
        """)
        self.update_user_password_stmt = c.prepare("""
            UPDATE iam_users
            SET password_hash = ?, must_change_password = ?
            WHERE id = ?
        """)
        self.update_user_enabled_stmt = c.prepare("""
            UPDATE iam_users SET enabled = ? WHERE id = ?
        """)
        self.update_workspace_stmt = c.prepare("""
            UPDATE iam_workspaces SET name = ?, enabled = ?
            WHERE id = ?
        """)
# ------------------------------------------------------------------
# Workspaces
# ------------------------------------------------------------------
async def put_workspace(self, id, name, enabled, created):
await async_execute(
self.cassandra, self.put_workspace_stmt,
(id, name, enabled, created),
)
async def get_workspace(self, id):
rows = await async_execute(
self.cassandra, self.get_workspace_stmt, (id,),
)
return rows[0] if rows else None
async def list_workspaces(self):
return await async_execute(
self.cassandra, self.list_workspaces_stmt,
)
# ------------------------------------------------------------------
# Users
# ------------------------------------------------------------------
async def put_user(
self, id, workspace, username, name, email, password_hash,
roles, enabled, must_change_password, created,
):
await async_execute(
self.cassandra, self.put_user_stmt,
(
id, workspace, username, name, email, password_hash,
set(roles) if roles else set(),
enabled, must_change_password, created,
),
)
await async_execute(
self.cassandra, self.put_username_lookup_stmt,
(workspace, username, id),
)
async def get_user(self, id):
rows = await async_execute(
self.cassandra, self.get_user_stmt, (id,),
)
return rows[0] if rows else None
async def get_user_id_by_username(self, workspace, username):
rows = await async_execute(
self.cassandra, self.get_user_id_by_username_stmt,
(workspace, username),
)
return rows[0][0] if rows else None
async def list_users_by_workspace(self, workspace):
return await async_execute(
self.cassandra, self.list_users_by_workspace_stmt, (workspace,),
)
async def list_users(self):
"""List every user across the deployment. Used by the
system-level list-users handler when no workspace filter is
supplied; the gateway has already authorised the call against
the caller's authority."""
return await async_execute(
self.cassandra, self.list_users_stmt, (),
)
async def delete_user(self, id):
await async_execute(
self.cassandra, self.delete_user_stmt, (id,),
)
async def delete_username_lookup(self, workspace, username):
await async_execute(
self.cassandra, self.delete_username_lookup_stmt,
(workspace, username),
)
# ------------------------------------------------------------------
# API keys
# ------------------------------------------------------------------
async def put_api_key(
self, key_hash, id, user_id, name, prefix, expires,
created, last_used,
):
await async_execute(
self.cassandra, self.put_api_key_stmt,
(key_hash, id, user_id, name, prefix, expires,
created, last_used),
)
async def get_api_key_by_hash(self, key_hash):
rows = await async_execute(
self.cassandra, self.get_api_key_by_hash_stmt, (key_hash,),
)
return rows[0] if rows else None
async def get_api_key_by_id(self, id):
rows = await async_execute(
self.cassandra, self.get_api_key_by_id_stmt, (id,),
)
return rows[0] if rows else None
async def list_api_keys_by_user(self, user_id):
return await async_execute(
self.cassandra, self.list_api_keys_by_user_stmt, (user_id,),
)
async def delete_api_key(self, key_hash):
await async_execute(
self.cassandra, self.delete_api_key_stmt, (key_hash,),
)
# ------------------------------------------------------------------
# Signing keys
# ------------------------------------------------------------------
async def put_signing_key(self, kid, private_pem, public_pem,
created, retired):
await async_execute(
self.cassandra, self.put_signing_key_stmt,
(kid, private_pem, public_pem, created, retired),
)
async def list_signing_keys(self):
return await async_execute(
self.cassandra, self.list_signing_keys_stmt,
)
async def retire_signing_key(self, kid, retired):
await async_execute(
self.cassandra, self.retire_signing_key_stmt,
(retired, kid),
)
# ------------------------------------------------------------------
# User partial updates
# ------------------------------------------------------------------
async def update_user_profile(
self, id, name, email, roles, enabled, must_change_password,
):
await async_execute(
self.cassandra, self.update_user_profile_stmt,
(
name, email,
set(roles) if roles else set(),
enabled, must_change_password, id,
),
)
async def update_user_password(
self, id, password_hash, must_change_password,
):
await async_execute(
self.cassandra, self.update_user_password_stmt,
(password_hash, must_change_password, id),
)
async def update_user_enabled(self, id, enabled):
await async_execute(
self.cassandra, self.update_user_enabled_stmt,
(enabled, id),
)
# ------------------------------------------------------------------
# Workspace updates
# ------------------------------------------------------------------
async def update_workspace(self, id, name, enabled):
await async_execute(
self.cassandra, self.update_workspace_stmt,
(name, enabled, id),
)
# ------------------------------------------------------------------
# Bootstrap helpers
# ------------------------------------------------------------------
async def any_workspace_exists(self):
rows = await self.list_workspaces()
return bool(rows)