mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-06 13:52:38 +02:00
Workspace identity is now determined by queue infrastructure instead of message body fields, closing a privilege-escalation vector where a caller could spoof the workspace in the request payload.

- Add WorkspaceProcessor base class: discovers workspaces from config at startup, creates per-workspace consumers (queue:workspace), and manages consumer lifecycle on workspace create/delete events
- Roll out to librarian, flow-svc, knowledge cores, and config-svc
- Config service gets a dual-queue regime: a system queue for cross-workspace ops (getvalues-all-ws, bootstrapper writes to __workspaces__) and per-workspace queues for tenant-scoped ops, with workspace discovery from its own Cassandra store
- Remove the workspace field from request schemas (FlowRequest, LibrarianRequest, KnowledgeRequest, CollectionManagementRequest) and from DocumentMetadata / ProcessingMetadata — table stores now accept workspace as an explicit parameter
- Strip workspace encode/decode from all message translators and gateway serializers
- Gateway enforces workspace existence: reject requests targeting non-existent workspaces instead of routing to queues with no consumer
- Config service provisions new workspaces from __template__ on creation
- Add workspace lifecycle hooks to AsyncProcessor so any processor can react to workspace create/delete without subclassing WorkspaceProcessor
387 lines
12 KiB
Python
|
|
import asyncio
|
|
import queue
|
|
import uuid
|
|
import logging
|
|
|
|
# Module logger
logger = logging.getLogger(__name__)

# Maximum worker tasks allowed in flight per socket before
# start_request_task applies back-pressure to new requests.
MAX_OUTSTANDING_REQUESTS = 15

# Seconds to wait on the oldest worker task when reaping finished
# workers; kept tiny so the reaper never stalls the drain loop.
WORKER_CLOSE_WAIT = 0.01

# Seconds to sleep between back-pressure checks once the outstanding
# worker limit has been reached.
START_REQUEST_WAIT = 0.1

# Bound on the receive -> run request queue.
# This buffers requests until task start, so short-lived
MAX_QUEUE_SIZE = 10
|
|
|
|
class Mux:
    """Multiplexes request/response traffic over one WebSocket.

    Frames arrive via ``receive``; validated requests are buffered on a
    bounded asyncio queue and drained by ``run``, which fans each
    request out to its own worker task so several requests can be in
    flight concurrently.  The first frame must be an in-band auth
    frame (see ``_handle_auth_frame``); every non-auth frame is
    refused until an ``auth-ok`` has been issued.
    """

    def __init__(self, dispatcher_manager, ws, running, auth):
        """
        ``auth`` is required — the Mux implements the first-frame
        auth protocol described in ``iam.md`` and will refuse any
        non-auth frame until an ``auth-ok`` has been issued. There
        is no no-auth mode.
        """
        if auth is None:
            raise ValueError(
                "Mux requires an 'auth' argument — there is no "
                "no-auth mode"
            )

        self.dispatcher_manager = dispatcher_manager
        self.ws = ws
        self.running = running
        self.auth = auth

        # Authenticated identity, populated by the first-frame auth
        # protocol.  ``None`` means the socket is not yet
        # authenticated; any non-auth frame is refused.
        self.identity = None

        # Buffers requests between receive() and run(); bounded so a
        # fast sender cannot grow memory without limit.
        self.q = asyncio.Queue(maxsize=MAX_QUEUE_SIZE)

    async def destroy(self):
        """Stop the drain loop and close the underlying WebSocket."""

        self.running.stop()

        if self.ws:
            await self.ws.close()

    async def _handle_auth_frame(self, data):
        """Process a ``{"type": "auth", "token": "..."}`` frame.

        On success, updates ``self.identity`` and returns an
        ``auth-ok`` response frame. On failure, returns the masked
        auth-failure frame. Never raises — auth failures keep the
        socket open so the client can retry without reconnecting
        (important for browsers, which treat a handshake-time 401
        as terminal)."""

        token = data.get("token", "")
        if not token:
            # Masked failure: no detail is leaked about why auth failed.
            await self.ws.send_json({
                "type": "auth-failed",
                "error": "auth failure",
            })
            return

        # Minimal request-like object so the HTTP authenticator can be
        # reused unchanged: it only reads ``headers``.
        class _Shim:
            def __init__(self, tok):
                self.headers = {"Authorization": f"Bearer {tok}"}

        try:
            identity = await self.auth.authenticate(_Shim(token))
        except Exception:
            # Any authentication error is masked identically.
            await self.ws.send_json({
                "type": "auth-failed",
                "error": "auth failure",
            })
            return

        self.identity = identity
        await self.ws.send_json({
            "type": "auth-ok",
            "workspace": identity.workspace,
        })

    async def receive(self, msg):
        """Handle one inbound WebSocket frame.

        Auth frames are processed immediately; request frames are
        validated, authorised, and queued for ``run``.  Any failure is
        reported to the client as an error frame — this method never
        raises to the caller.
        """

        request_id = None

        try:

            data = msg.json()

            # In-band auth protocol: the client sends
            # ``{"type": "auth", "token": "..."}`` as its first frame
            # (and any time it wants to re-auth: JWT refresh, token
            # rotation, etc). Auth is always required on a Mux —
            # there is no no-auth mode.
            if isinstance(data, dict) and data.get("type") == "auth":
                await self._handle_auth_frame(data)
                return

            request_id = data.get("id")

            if "request" not in data:
                raise RuntimeError("Bad message")

            if "id" not in data:
                raise RuntimeError("Bad message")

            # Reject all non-auth frames until an ``auth-ok`` has
            # been issued.
            if self.identity is None:
                await self.ws.send_json({
                    "id": request_id,
                    "error": {
                        "message": "auth failure",
                        "type": "auth-required",
                    },
                    "complete": True,
                })
                return

            # Per-service capability gating. Resolved through the
            # operation registry so the WS path matches what HTTP
            # callers see — same authority, same caps.
            #
            # Lookup mirrors the HTTP routing decision in
            # ``request_task``: presence of ``flow`` on the envelope
            # means a flow-level data-plane service (graph-rag,
            # agent, …); absence means a workspace-level service
            # (config, flow management, librarian, …) whose specific
            # operation is in the inner request body. ``iam`` is
            # treated as workspace-level too — its operations are
            # registered with bare names, no kind prefix.
            from ..registry import lookup as _registry_lookup
            from ..capabilities import enforce_workspace
            from aiohttp import web as _web

            service = data.get("service", "")
            inner = data.get("request") or {}
            inner_op = inner.get("operation", "") if isinstance(inner, dict) else ""

            if data.get("flow"):
                op = _registry_lookup(f"flow-service:{service}")
            elif service == "iam":
                op = _registry_lookup(inner_op) if inner_op else None
            else:
                op = _registry_lookup(f"{service}:{inner_op}") if inner_op else None

            if op is None:
                await self.ws.send_json({
                    "id": request_id,
                    "error": {
                        "message": "unknown service",
                        "type": "unknown-service",
                    },
                    "complete": True,
                })
                return

            # Resolve workspace first (default-fill from the caller's
            # bound workspace), then ask the regime to authorise the
            # service-level capability against the matched
            # operation's resource shape.
            try:
                await enforce_workspace(data, self.identity, self.auth)
                if isinstance(inner, dict):
                    await enforce_workspace(inner, self.identity, self.auth)

                if data.get("flow"):
                    resource = {
                        "workspace": data.get("workspace", ""),
                        "flow": data.get("flow", ""),
                    }
                    parameters = {}
                else:
                    # Build a minimal RequestContext so the matched
                    # operation's own extractors decide resource and
                    # parameters — same path the HTTP endpoints take.
                    from ..registry import RequestContext
                    ctx = RequestContext(
                        body=inner if isinstance(inner, dict) else {},
                        match_info={},
                        identity=self.identity,
                    )
                    resource = op.extract_resource(ctx)
                    parameters = op.extract_parameters(ctx)

                await self.auth.authorise(
                    self.identity, op.capability, resource, parameters,
                )
            except _web.HTTPNotFound:
                await self.ws.send_json({
                    "id": request_id,
                    "error": {
                        "message": "workspace not found",
                        "type": "workspace-not-found",
                    },
                    "complete": True,
                })
                return
            except _web.HTTPForbidden:
                await self.ws.send_json({
                    "id": request_id,
                    "error": {
                        "message": "access denied",
                        "type": "access-denied",
                    },
                    "complete": True,
                })
                return
            except _web.HTTPUnauthorized:
                await self.ws.send_json({
                    "id": request_id,
                    "error": {
                        "message": "auth failure",
                        "type": "auth-required",
                    },
                    "complete": True,
                })
                return

            # By this point enforce_workspace is expected to have
            # default-filled "workspace" on the envelope; a missing key
            # here raises and is reported below.
            workspace = data["workspace"]

            # Plumb authenticated caller's handle as ``actor`` so
            # iam-svc handlers (whoami, future actor-scoped checks)
            # know who is calling. Overwrite any caller-supplied
            # value so it can't be spoofed over the WS.
            if (
                service == "iam"
                and isinstance(data.get("request"), dict)
                and self.identity is not None
            ):
                data["request"]["actor"] = self.identity.handle

            await self.q.put((
                data["id"],
                workspace,
                data.get("flow"),
                data["service"],
                data["request"]
            ))

        except Exception as e:
            logger.error(f"Receive exception: {str(e)}", exc_info=True)
            error_resp = {
                "error": {"message": str(e), "type": "error"},
                "complete": True,
            }
            # ``is not None`` rather than truthiness so legitimate
            # falsy ids (0, "") still round-trip to the client.
            if request_id is not None:
                error_resp["id"] = request_id
            await self.ws.send_json(error_resp)

    async def maybe_tidy_workers(self, workers):
        """Reap finished worker tasks from the front of ``workers``.

        Workers complete roughly in start order, so only the head of
        the list is polled; as soon as the head is found still running
        the scan stops.  Each wait is bounded by WORKER_CLOSE_WAIT so
        this never stalls the caller.
        """

        while workers:

            try:
                # shield() stops wait_for from cancelling the worker
                # when the short probe wait times out.
                await asyncio.wait_for(
                    asyncio.shield(workers[0]),
                    WORKER_CLOSE_WAIT
                )

            except asyncio.TimeoutError:
                # Head worker still running — stop scanning.  Catching
                # asyncio.TimeoutError (an alias of the builtin on
                # 3.11+) keeps this correct on Python < 3.11 too.
                break

            except Exception:
                # Worker finished by raising.  request_task() reports
                # its own errors to the client, so log and fall through
                # to drop the task rather than letting the exception
                # escape into the run() loop.
                logger.error("Worker task failed", exc_info=True)

            # Head worker has completed; release the reference and
            # try the next one.
            workers.pop(0)

    async def start_request_task(
            self, ws, id, workspace, flow, svc, request, workers,
    ):
        """Launch a worker task for one request.

        Applies back-pressure: waits until outstanding workers drop
        below MAX_OUTSTANDING_REQUESTS before starting a new one.
        ``ws`` is accepted for interface compatibility; the responder
        writes to ``self.ws``.
        """

        # Wait for outstanding requests to go below MAX_OUTSTANDING_REQUESTS
        while len(workers) > MAX_OUTSTANDING_REQUESTS:

            # Sleeping yields the event loop so in-flight workers can
            # make progress (fixes deadlock).
            # FIXME: Put it in its own loop
            await asyncio.sleep(START_REQUEST_WAIT)

            await self.maybe_tidy_workers(workers)

        async def responder(resp, fin):
            # Stream a (possibly partial) response frame back to the
            # client; ``fin`` marks the final frame for this id.
            await self.ws.send_json({
                "id": id,
                "response": resp,
                "complete": fin,
            })

        worker = asyncio.create_task(
            self.request_task(
                id, request, responder, workspace, flow, svc,
            )
        )

        workers.append(worker)

    async def request_task(
            self, id, request, responder, workspace, flow, svc,
    ):
        """Dispatch one request to the appropriate service invoker.

        Mirrors the routing rule used during authorisation: a ``flow``
        on the envelope routes to a flow-level service, otherwise to a
        workspace-level (global) service.  Errors are reported to the
        client; this coroutine never raises.
        """

        try:

            if flow:

                await self.dispatcher_manager.invoke_flow_service(
                    request, responder, workspace, flow, svc,
                )

            else:

                await self.dispatcher_manager.invoke_global_service(
                    request, responder, svc, workspace=workspace,
                )

        except Exception as e:

            await self.ws.send_json({
                "id": id,
                "error": {"message": str(e), "type": "error"},
                "complete": True,
            })

    async def run(self):
        """Drain loop: pull queued requests and start worker tasks.

        Runs until ``self.running`` is stopped or an internal error
        occurs, in which case the socket is closed and the loop exits.
        """

        # Outstanding worker tasks, oldest first
        workers = []

        while self.running.get():

            # Nothing unpacked yet this iteration; used below to decide
            # whether an error frame can carry a request id.
            id = None

            try:

                if len(workers) > 0:
                    await self.maybe_tidy_workers(workers)

                # Get next request on queue; the 1s timeout lets the
                # loop re-check the running flag periodically.
                item = await asyncio.wait_for(self.q.get(), 1)
                id, workspace, flow, svc, request = item

            except asyncio.TimeoutError:
                # Idle tick — loop around and re-check self.running.
                # asyncio.TimeoutError (not the bare builtin) so the
                # idle path also works on Python < 3.11.
                continue

            except Exception as e:
                # This is an internal working error, may not be recoverable
                logger.error(f"Run prepare exception: {e}", exc_info=True)
                error_resp = {
                    "error": {"message": str(e), "type": "error"},
                    "complete": True,
                }
                # Only attach an id when a request was actually
                # unpacked — previously this path referenced an
                # unbound name and raised NameError instead.
                if id is not None:
                    error_resp["id"] = id
                await self.ws.send_json(error_resp)
                self.running.stop()

                if self.ws:
                    await self.ws.close()
                    self.ws = None

                break

            try:

                await self.start_request_task(
                    self.ws, id, workspace, flow, svc, request, workers
                )

            except Exception as e:
                logger.error(f"Exception in mux: {e}", exc_info=True)
                await self.ws.send_json({
                    "id": id,
                    "error": {"message": str(e), "type": "error"},
                    "complete": True,
                })

                self.running.stop()

                if self.ws:
                    await self.ws.close()
                    self.ws = None