trustgraph/trustgraph-flow/trustgraph/gateway/dispatch/mux.py
cybermaggedon 9f2bfbce0c
Per-workspace queue routing for workspace-scoped services (#862)
Workspace identity is now determined by queue infrastructure instead of
message body fields, closing a privilege-escalation vector where a caller
could spoof workspace in the request payload.

- Add WorkspaceProcessor base class: discovers workspaces from config at
  startup, creates per-workspace consumers (queue:workspace), and manages
  consumer lifecycle on workspace create/delete events
- Roll out to librarian, flow-svc, knowledge cores, and config-svc
- Config service gets a dual-queue regime: a system queue for
  cross-workspace ops (getvalues-all-ws, bootstrapper writes to
  __workspaces__) and per-workspace queues for tenant-scoped ops, with
  workspace discovery from its own Cassandra store
- Remove workspace field from request schemas (FlowRequest,
  LibrarianRequest, KnowledgeRequest, CollectionManagementRequest) and
  from DocumentMetadata / ProcessingMetadata — table stores now accept
  workspace as an explicit parameter
- Strip workspace encode/decode from all message translators and gateway
  serializers
- Gateway enforces workspace existence: reject requests targeting
  non-existent workspaces instead of routing to queues with no consumer
- Config service provisions new workspaces from __template__ on creation
- Add workspace lifecycle hooks to AsyncProcessor so any processor can
  react to workspace create/delete without subclassing WorkspaceProcessor
2026-05-04 10:30:03 +01:00

387 lines
12 KiB
Python

import asyncio
import queue
import uuid
import logging
# Module logger
logger = logging.getLogger(__name__)

# Cap on request tasks tracked per socket: ``start_request_task`` holds a
# new request back while this many workers are still in the list.
MAX_OUTSTANDING_REQUESTS = 15

# Seconds ``maybe_tidy_workers`` waits on the oldest worker before
# concluding that it is still running.
WORKER_CLOSE_WAIT = 0.01

# Seconds ``start_request_task`` sleeps between capacity checks.
START_REQUEST_WAIT = 0.1

# Capacity of the hand-off queue between ``receive`` and ``run``.  It only
# buffers requests until their task starts, so entries are short-lived.
MAX_QUEUE_SIZE = 10


class Mux:
    """Multiplex request/response exchanges over one websocket.

    ``receive`` handles inbound frames: it runs the in-band auth
    protocol, authorises each request and places accepted requests on an
    internal queue.  ``run`` drains that queue and starts one asyncio
    task per request; each task answers on the same socket, tagging its
    frames with the request ``id``.
    """

    class _BearerRequest:
        """Request stand-in carrying only an ``Authorization`` header.

        ``auth.authenticate`` is handed an object with a ``headers``
        mapping; this wraps a token received in-band in that shape.
        """

        def __init__(self, tok):
            self.headers = {"Authorization": f"Bearer {tok}"}

    def __init__(self, dispatcher_manager, ws, running, auth):
        """
        ``auth`` is required — the Mux implements the first-frame
        auth protocol described in ``iam.md`` and will refuse any
        non-auth frame until an ``auth-ok`` has been issued.  There
        is no no-auth mode.

        Raises:
            ValueError: if ``auth`` is ``None``.
        """
        if auth is None:
            raise ValueError(
                "Mux requires an 'auth' argument — there is no "
                "no-auth mode"
            )

        self.dispatcher_manager = dispatcher_manager
        self.ws = ws
        self.running = running
        self.auth = auth

        # Authenticated identity, populated by the first-frame auth
        # protocol.  ``None`` means the socket is not yet
        # authenticated; any non-auth frame is refused.
        self.identity = None

        self.q = asyncio.Queue(maxsize=MAX_QUEUE_SIZE)

    async def destroy(self):
        """Stop the run loop and close the socket if one is attached."""
        self.running.stop()
        if self.ws is not None:
            await self.ws.close()

    async def _send_error(self, request_id, message, error_type="error"):
        """Send a terminal error frame on the socket.

        The ``id`` key is included only when ``request_id`` is not
        ``None``, so falsy-but-valid ids such as ``0`` are preserved.
        """
        frame = {}
        if request_id is not None:
            frame["id"] = request_id
        frame["error"] = {"message": message, "type": error_type}
        frame["complete"] = True
        await self.ws.send_json(frame)

    async def _send_auth_failed(self):
        """Send the masked auth-failure frame (no detail is disclosed)."""
        await self.ws.send_json({
            "type": "auth-failed",
            "error": "auth failure",
        })

    async def _abort(self, request_id, exc):
        """Report a fatal error to the client and shut the mux down.

        Delivery of the error frame and closing the socket are both
        best-effort: a socket that is already dead must not prevent
        ``running.stop()`` from being called.
        """
        if self.ws is not None:
            try:
                await self._send_error(request_id, str(exc))
            except Exception:
                logger.warning(
                    "Could not deliver error frame", exc_info=True
                )

        self.running.stop()

        if self.ws is not None:
            try:
                await self.ws.close()
            except Exception:
                logger.warning("Could not close websocket", exc_info=True)
            self.ws = None

    @staticmethod
    def _operation_key(data, service, inner_op):
        """Return the registry key for a request envelope, or ``None``.

        Presence of ``flow`` on the envelope selects a flow-level
        service, keyed ``flow-service:<service>``.  Otherwise the key
        comes from the inner request's operation: bare for ``iam``,
        ``<service>:<operation>`` for everything else.  ``None`` means
        the envelope names no operation to look up.
        """
        if data.get("flow"):
            return f"flow-service:{service}"
        if not inner_op:
            return None
        if service == "iam":
            return inner_op
        return f"{service}:{inner_op}"

    async def _handle_auth_frame(self, data):
        """Process a ``{"type": "auth", "token": "..."}`` frame.

        On success, stores the authenticated identity on
        ``self.identity`` and sends an ``auth-ok`` frame carrying the
        identity's workspace.  On a missing token or any exception from
        ``auth.authenticate``, sends the masked ``auth-failed`` frame
        and leaves ``self.identity`` unchanged.  Authentication failures
        are reported in-band rather than raised, so the socket stays
        open and the client can retry without reconnecting (important
        for browsers, which treat a handshake-time 401 as terminal).
        Returns ``None`` in every case.
        """
        token = data.get("token", "")
        if not token:
            await self._send_auth_failed()
            return

        try:
            identity = await self.auth.authenticate(
                self._BearerRequest(token)
            )
        except Exception:
            # The client only ever sees the masked failure; keep the
            # cause available to operators at debug level.
            logger.debug("WebSocket authentication failed", exc_info=True)
            await self._send_auth_failed()
            return

        self.identity = identity
        await self.ws.send_json({
            "type": "auth-ok",
            "workspace": identity.workspace,
        })

    async def receive(self, msg):
        """Handle one inbound websocket message.

        Auth frames are passed to ``_handle_auth_frame``.  Any other
        frame must be a JSON object with ``id`` and ``request``; it is
        refused until the socket is authenticated, then checked against
        the operation registry and the auth regime, and finally queued
        for ``run``.  Every refusal is reported to the client as an
        error frame with ``complete`` set.
        """
        request_id = None

        try:
            data = msg.json()

            # Anything other than a JSON object cannot be an auth frame
            # or a request envelope.
            if not isinstance(data, dict):
                raise RuntimeError("Bad message")

            # In-band auth protocol: the client sends
            # ``{"type": "auth", "token": "..."}`` as its first frame
            # (and any time it wants to re-auth: JWT refresh, token
            # rotation, etc).  Auth is always required on a Mux —
            # there is no no-auth mode.
            if data.get("type") == "auth":
                await self._handle_auth_frame(data)
                return

            request_id = data.get("id")

            if "request" not in data or "id" not in data:
                raise RuntimeError("Bad message")

            # Reject all non-auth frames until an ``auth-ok`` has
            # been issued.
            if self.identity is None:
                await self._send_error(
                    request_id, "auth failure", "auth-required"
                )
                return

            # Per-service capability gating.  Resolved through the
            # operation registry so the WS path matches what HTTP
            # callers see — same authority, same caps.
            #
            # Imported here rather than at module level, as in the
            # original code.
            from ..registry import lookup as _registry_lookup
            from ..registry import RequestContext
            from ..capabilities import enforce_workspace
            from aiohttp import web as _web

            service = data.get("service", "")
            inner = data.get("request") or {}
            inner_is_dict = isinstance(inner, dict)
            inner_op = inner.get("operation", "") if inner_is_dict else ""

            key = self._operation_key(data, service, inner_op)
            op = _registry_lookup(key) if key is not None else None

            if op is None:
                await self._send_error(
                    request_id, "unknown service", "unknown-service"
                )
                return

            # Resolve workspace first (default-fill from the caller's
            # bound workspace), then ask the regime to authorise the
            # service-level capability against the matched
            # operation's resource shape.
            try:
                await enforce_workspace(data, self.identity, self.auth)
                if inner_is_dict:
                    await enforce_workspace(inner, self.identity, self.auth)

                if data.get("flow"):
                    resource = {
                        "workspace": data.get("workspace", ""),
                        "flow": data.get("flow", ""),
                    }
                    parameters = {}
                else:
                    # Build a minimal RequestContext so the matched
                    # operation's own extractors decide resource and
                    # parameters — same path the HTTP endpoints take.
                    ctx = RequestContext(
                        body=inner if inner_is_dict else {},
                        match_info={},
                        identity=self.identity,
                    )
                    resource = op.extract_resource(ctx)
                    parameters = op.extract_parameters(ctx)

                await self.auth.authorise(
                    self.identity, op.capability, resource, parameters,
                )

            except _web.HTTPNotFound:
                await self._send_error(
                    request_id, "workspace not found", "workspace-not-found"
                )
                return
            except _web.HTTPForbidden:
                await self._send_error(
                    request_id, "access denied", "access-denied"
                )
                return
            except _web.HTTPUnauthorized:
                await self._send_error(
                    request_id, "auth failure", "auth-required"
                )
                return

            workspace = data["workspace"]

            # Plumb authenticated caller's handle as ``actor`` so
            # iam-svc handlers (whoami, future actor-scoped checks)
            # know who is calling.  Overwrite any caller-supplied
            # value so it can't be spoofed over the WS.
            if (
                service == "iam"
                and isinstance(data.get("request"), dict)
                and self.identity is not None
            ):
                data["request"]["actor"] = self.identity.handle

            await self.q.put((
                data["id"],
                workspace,
                data.get("flow"),
                data["service"],
                data["request"],
            ))

        except Exception as e:
            logger.error(f"Receive exception: {str(e)}", exc_info=True)
            await self._send_error(request_id, str(e))

    async def maybe_tidy_workers(self, workers):
        """Drop finished tasks from the front of ``workers`` in place.

        Waits up to ``WORKER_CLOSE_WAIT`` seconds for the oldest worker;
        if it finishes it is removed and the next one is tried.  Stops
        at the first worker that is still running, or when the list is
        empty.  An exception raised by a finished worker propagates to
        the caller.
        """
        while workers:
            try:
                # shield() so that the timeout abandons the wait without
                # cancelling the worker itself.
                await asyncio.wait_for(
                    asyncio.shield(workers[0]),
                    WORKER_CLOSE_WAIT
                )
            except asyncio.TimeoutError:
                # asyncio.TimeoutError is only an alias of the builtin
                # TimeoutError from Python 3.11; name it explicitly so
                # older interpreters behave the same.
                # workers[0] still running, move on
                break

            # workers[0] has stopped; forget it and try the next one.
            workers.pop(0)

    async def start_request_task(
            self, ws, id, workspace, flow, svc, request, workers,
    ):
        """Start a task servicing one request and append it to ``workers``.

        Blocks while ``MAX_OUTSTANDING_REQUESTS`` workers are already
        tracked, polling every ``START_REQUEST_WAIT`` seconds.  The
        ``ws`` argument is accepted for interface compatibility;
        responses are sent on ``self.ws``.
        """
        # Wait for outstanding requests to go below MAX_OUTSTANDING_REQUESTS
        while len(workers) >= MAX_OUTSTANDING_REQUESTS:

            # Fixes deadlock
            # FIXME: Put it in its own loop
            await asyncio.sleep(START_REQUEST_WAIT)

            await self.maybe_tidy_workers(workers)

        async def responder(resp, fin):
            await self.ws.send_json({
                "id": id,
                "response": resp,
                "complete": fin,
            })

        worker = asyncio.create_task(
            self.request_task(
                id, request, responder, workspace, flow, svc,
            )
        )

        workers.append(worker)

    async def request_task(
            self, id, request, responder, workspace, flow, svc,
    ):
        """Dispatch one request and report any failure to the client.

        A truthy ``flow`` routes to ``invoke_flow_service``; otherwise
        the request goes to ``invoke_global_service``.  Any exception
        from the dispatcher is turned into an error frame for ``id``.
        """
        try:
            if flow:
                await self.dispatcher_manager.invoke_flow_service(
                    request, responder, workspace, flow, svc,
                )
            else:
                await self.dispatcher_manager.invoke_global_service(
                    request, responder, svc, workspace=workspace,
                )
        except Exception as e:
            await self._send_error(id, str(e))

    async def run(self):
        """Drain the request queue until ``running`` is stopped.

        Each queued request is handed to ``start_request_task``.  Any
        unexpected error is reported to the client on a best-effort
        basis, after which the mux stops itself and closes the socket.
        """
        # Request tasks currently being serviced
        workers = []

        while self.running.get():

            try:

                if workers:
                    await self.maybe_tidy_workers(workers)

                # Get next request on queue
                item = await asyncio.wait_for(self.q.get(), 1)
                request_id, workspace, flow, svc, request = item

            except asyncio.TimeoutError:
                # Nothing queued within the poll interval
                continue

            except Exception as e:
                # This is an internal working error, may not be recoverable.
                # No request has been dequeued at this point, so the
                # error frame carries no id.
                logger.error(f"Run prepare exception: {e}", exc_info=True)
                await self._abort(None, e)
                break

            try:
                await self.start_request_task(
                    self.ws, request_id, workspace, flow, svc, request,
                    workers,
                )

            except Exception as e:
                logger.error(f"Exception in mux: {e}", exc_info=True)
                await self._abort(request_id, e)
                break