Per-workspace queue routing for workspace-scoped services (#862)

Workspace identity is now determined by the queue infrastructure rather
than by message body fields, closing a privilege-escalation vector where
a caller could spoof the workspace in the request payload.

- Add WorkspaceProcessor base class: discovers workspaces from config at
  startup, creates per-workspace consumers (queue:workspace), and manages
  consumer lifecycle on workspace create/delete events
- Roll out to librarian, flow-svc, knowledge cores, and config-svc
- Config service gets a dual-queue regime: a system queue for
  cross-workspace ops (getvalues-all-ws, bootstrapper writes to
  __workspaces__) and per-workspace queues for tenant-scoped ops, with
  workspace discovery from its own Cassandra store
- Remove workspace field from request schemas (FlowRequest,
  LibrarianRequest, KnowledgeRequest, CollectionManagementRequest) and
  from DocumentMetadata / ProcessingMetadata — table stores now accept
  workspace as an explicit parameter
- Strip workspace encode/decode from all message translators and gateway
  serializers
- Gateway enforces workspace existence: reject requests targeting
  non-existent workspaces instead of routing them to queues that have no
  consumer (see the routing sketch after this list)
- Config service provisions new workspaces from __template__ on creation
- Add workspace lifecycle hooks to AsyncProcessor so any processor can
  react to workspace create/delete without subclassing WorkspaceProcessor
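
As a rough illustration of the routing model described above: the gateway
derives the destination queue from a workspace it has already validated,
never from the request payload. A minimal sketch; WorkspaceRouter,
known_workspaces, queue_for, and the example workspace names are all
illustrative assumptions, not the actual gateway code in this commit:

    # Sketch only: queue-per-workspace routing with up-front validation.
    class WorkspaceRouter:

        def __init__(self, known_workspaces: set[str]):
            self.known_workspaces = known_workspaces

        def queue_for(self, base_queue: str, workspace: str) -> str:
            # Reject up front rather than publishing to a queue nobody consumes
            if workspace not in self.known_workspaces:
                raise ValueError(f"unknown workspace: {workspace}")
            # Per-workspace queue naming convention (queue:workspace)
            return f"{base_queue}:{workspace}"

    router = WorkspaceRouter({"default", "workspace-a"})
    assert router.queue_for("librarian", "workspace-a") == "librarian:workspace-a"
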
cybermaggedon 2026-05-04 10:30:03 +01:00 committed by GitHub
parent 9be257ceee
commit 9f2bfbce0c
53 changed files with 1565 additions and 677 deletions

View file

@@ -217,7 +217,6 @@ class Library:
             "title": title,
             "comments": comments,
             "metadata": triples,
-            "workspace": self.api.workspace,
             "tags": tags
         },
         "content": base64.b64encode(document).decode("utf-8"),
@@ -249,7 +248,6 @@ class Library:
             "kind": kind,
             "title": title,
             "comments": comments,
-            "workspace": self.api.workspace,
             "tags": tags,
         },
         "total-size": total_size,
@@ -377,7 +375,6 @@ class Library:
                 )
                 for w in v["metadata"]
             ],
-            workspace = v.get("workspace", ""),
             tags = v["tags"],
             parent_id = v.get("parent-id", ""),
             document_type = v.get("document-type", "source"),
@@ -436,7 +433,6 @@ class Library:
                 )
                 for w in doc["metadata"]
             ],
-            workspace = doc.get("workspace", ""),
             tags = doc["tags"],
             parent_id = doc.get("parent-id", ""),
             document_type = doc.get("document-type", "source"),
@@ -485,7 +481,6 @@ class Library:
             "operation": "update-document",
             "workspace": self.api.workspace,
             "document-metadata": {
-                "workspace": self.api.workspace,
                 "document-id": id,
                 "time": metadata.time,
                 "title": metadata.title,
@@ -599,7 +594,6 @@ class Library:
             "document-id": document_id,
             "time": int(time.time()),
             "flow": flow,
-            "workspace": self.api.workspace,
             "collection": collection,
             "tags": tags,
         }
@@ -681,7 +675,6 @@ class Library:
                 document_id = v["document-id"],
                 time = datetime.datetime.fromtimestamp(v["time"]),
                 flow = v["flow"],
-                workspace = v.get("workspace", ""),
                 collection = v["collection"],
                 tags = v["tags"],
             )
@@ -945,7 +938,6 @@ class Library:
             "title": title,
             "comments": comments,
             "metadata": triples,
-            "workspace": self.api.workspace,
             "tags": tags,
             "parent-id": parent_id,
             "document-type": "extracted",

View file

@@ -65,7 +65,6 @@ class DocumentMetadata:
         title: Document title
         comments: Additional comments or description
         metadata: List of RDF triples providing structured metadata
-        workspace: Workspace the document belongs to
         tags: List of tags for categorization
         parent_id: Parent document ID for child documents (empty for top-level docs)
         document_type: "source" for uploaded documents, "extracted" for derived content
@@ -76,7 +75,6 @@ class DocumentMetadata:
     title : str
     comments : str
     metadata : List[Triple]
-    workspace : str
     tags : List[str]
     parent_id : str = ""
     document_type : str = "source"
@@ -91,7 +89,6 @@ class ProcessingMetadata:
         document_id: ID of the document being processed
         time: Processing start timestamp
         flow: Flow instance handling the processing
-        workspace: Workspace the processing job belongs to
         collection: Target collection for processed data
         tags: List of tags for categorization
     """
@@ -99,7 +96,6 @@ class ProcessingMetadata:
     document_id : str
     time : datetime.datetime
     flow : str
-    workspace : str
     collection : str
     tags : List[str]

View file

@@ -7,6 +7,7 @@ from . publisher import Publisher
 from . subscriber import Subscriber
 from . metrics import ProcessorMetrics, ConsumerMetrics, ProducerMetrics, SubscriberMetrics
 from . logging import add_logging_args, setup_logging
+from . workspace_processor import WorkspaceProcessor
 from . flow_processor import FlowProcessor
 from . consumer_spec import ConsumerSpec
 from . parameter_spec import ParameterSpec

View file

@@ -71,6 +71,11 @@ class AsyncProcessor:
         # { "handler": async_fn, "types": set_or_none }
         self.config_handlers = []

+        # Workspace lifecycle handlers, called when workspaces are
+        # created or deleted. Each entry is an async callable:
+        #   async def handler(workspace_changes: WorkspaceChanges)
+        self.workspace_handlers = []
+
         # Track the current config version for dedup
         self.config_version = 0
@@ -251,6 +256,10 @@ class AsyncProcessor:
             "types": set(types) if types else None,
         })

+    # Register a handler for workspace lifecycle events
+    def register_workspace_handler(self, handler: Callable[..., Any]) -> None:
+        self.workspace_handlers.append(handler)
+
     # Called when a config notify message arrives
     async def on_config_notify(self, message, consumer, flow):
@@ -266,6 +275,16 @@ class AsyncProcessor:
             )
             return

+        # Dispatch workspace lifecycle events before config handlers
+        if v.workspace_changes and self.workspace_handlers:
+            for handler in self.workspace_handlers:
+                try:
+                    await handler(v.workspace_changes)
+                except Exception as e:
+                    logger.error(
+                        f"Workspace handler failed: {e}", exc_info=True
+                    )
+
         notify_types = set(changes.keys())

         # Filter out handlers that don't care about any of the changed
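
A usage sketch of the hooks added above; the subclass name and handler
body are illustrative, not part of this commit:

    import logging

    logger = logging.getLogger(__name__)

    # Any AsyncProcessor subclass can observe workspace lifecycle events
    # without subclassing WorkspaceProcessor.
    class AuditingProcessor(AsyncProcessor):

        def __init__(self, **params):
            super().__init__(**params)
            self.register_workspace_handler(self.on_workspaces_changed)

        # Receives the WorkspaceChanges dataclass from the config schema
        async def on_workspaces_changed(self, workspace_changes):
            for ws in workspace_changes.created:
                logger.info(f"workspace created: {ws}")
            for ws in workspace_changes.deleted:
                logger.info(f"workspace deleted: {ws}")
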

View file

@@ -14,7 +14,7 @@ from .. schema import Error
 from .. schema import config_request_queue, config_response_queue
 from .. schema import config_push_queue
 from .. log_level import LogLevel
-from . async_processor import AsyncProcessor
+from . workspace_processor import WorkspaceProcessor
 from . flow import Flow

 # Module logger
@@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)

 # Parent class for configurable processors, configured with flows by
 # the config service
-class FlowProcessor(AsyncProcessor):
+class FlowProcessor(WorkspaceProcessor):

     def __init__(self, **params):
@@ -113,7 +113,7 @@ class FlowProcessor(AsyncProcessor):
     @staticmethod
     def add_args(parser: ArgumentParser) -> None:
-        AsyncProcessor.add_args(parser)
+        WorkspaceProcessor.add_args(parser)

         # parser.add_argument(
         #     '--rate-limit-retry',

View file

@@ -202,7 +202,6 @@ class LibrarianClient:
         doc_metadata = DocumentMetadata(
             id=doc_id,
-            workspace=workspace,
             kind=kind,
             title=title or doc_id,
             parent_id=parent_id,
@@ -227,7 +226,6 @@ class LibrarianClient:
         doc_metadata = DocumentMetadata(
             id=doc_id,
-            workspace=workspace,
             kind=kind,
             title=title or doc_id,
             document_type=document_type,

View file

@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+from argparse import ArgumentParser
+import logging
+
+from . async_processor import AsyncProcessor
+
+logger = logging.getLogger(__name__)
+
+WORKSPACES_NAMESPACE = "__workspaces__"
+WORKSPACE_TYPE = "workspace"
+
+class WorkspaceProcessor(AsyncProcessor):
+
+    def __init__(self, **params):
+        super(WorkspaceProcessor, self).__init__(**params)
+        self.active_workspaces = set()
+        self.register_workspace_handler(self._handle_workspace_changes)
+
+    async def _discover_workspaces(self):
+        client = self._create_config_client()
+        try:
+            await client.start()
+            type_data, version = await self._fetch_type_all_workspaces(
+                client, WORKSPACE_TYPE,
+            )
+            for ws in type_data:
+                if ws == WORKSPACES_NAMESPACE:
+                    for workspace_id in type_data[ws]:
+                        if workspace_id not in self.active_workspaces:
+                            self.active_workspaces.add(workspace_id)
+                            await self.on_workspace_created(workspace_id)
+        finally:
+            await client.stop()
+
+    async def _handle_workspace_changes(self, workspace_changes):
+        for workspace_id in workspace_changes.created:
+            if workspace_id not in self.active_workspaces:
+                self.active_workspaces.add(workspace_id)
+                logger.info(f"Workspace created: {workspace_id}")
+                await self.on_workspace_created(workspace_id)
+        for workspace_id in workspace_changes.deleted:
+            if workspace_id in self.active_workspaces:
+                logger.info(f"Workspace deleted: {workspace_id}")
+                await self.on_workspace_deleted(workspace_id)
+                self.active_workspaces.discard(workspace_id)
+
+    async def on_workspace_created(self, workspace):
+        pass
+
+    async def on_workspace_deleted(self, workspace):
+        pass
+
+    async def start(self):
+        await super(WorkspaceProcessor, self).start()
+        await self._discover_workspaces()
+
+    @staticmethod
+    def add_args(parser: ArgumentParser) -> None:
+        AsyncProcessor.add_args(parser)
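
A possible subclass, sketched under the assumption that a service keeps
one consumer per workspace queue; start_consumer and the consumer's
stop() are hypothetical stand-ins for whatever per-workspace resource a
real processor manages:

    class PerWorkspaceService(WorkspaceProcessor):

        def __init__(self, **params):
            super().__init__(**params)
            self.consumers = {}

        async def on_workspace_created(self, workspace):
            # Hypothetical: attach a consumer to this workspace's queue
            self.consumers[workspace] = await self.start_consumer(
                f"my-svc:{workspace}"
            )

        async def on_workspace_deleted(self, workspace):
            # Tear down the per-workspace consumer, if any
            consumer = self.consumers.pop(workspace, None)
            if consumer is not None:
                await consumer.stop()
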

View file

@@ -9,7 +9,6 @@ class CollectionManagementRequestTranslator(MessageTranslator):
     def decode(self, data: Dict[str, Any]) -> CollectionManagementRequest:
         return CollectionManagementRequest(
             operation=data.get("operation"),
-            workspace=data.get("workspace", ""),
             collection=data.get("collection"),
             timestamp=data.get("timestamp"),
             name=data.get("name"),
@@ -24,8 +23,6 @@ class CollectionManagementRequestTranslator(MessageTranslator):
         if obj.operation is not None:
             result["operation"] = obj.operation
-        if obj.workspace:
-            result["workspace"] = obj.workspace
         if obj.collection is not None:
             result["collection"] = obj.collection
         if obj.timestamp is not None:

View file

@@ -9,7 +9,6 @@ class FlowRequestTranslator(MessageTranslator):
     def decode(self, data: Dict[str, Any]) -> FlowRequest:
         return FlowRequest(
             operation=data.get("operation"),
-            workspace=data.get("workspace", ""),
             blueprint_name=data.get("blueprint-name"),
             blueprint_definition=data.get("blueprint-definition"),
             description=data.get("description"),
@@ -22,8 +21,6 @@ class FlowRequestTranslator(MessageTranslator):
         if obj.operation is not None:
             result["operation"] = obj.operation
-        if obj.workspace is not None:
-            result["workspace"] = obj.workspace
         if obj.blueprint_name is not None:
             result["blueprint-name"] = obj.blueprint_name
         if obj.blueprint_definition is not None:

View file

@@ -45,7 +45,6 @@ class KnowledgeRequestTranslator(MessageTranslator):
         return KnowledgeRequest(
             operation=data.get("operation"),
-            workspace=data.get("workspace", ""),
             id=data.get("id"),
             flow=data.get("flow"),
             collection=data.get("collection"),
@@ -58,8 +57,6 @@ class KnowledgeRequestTranslator(MessageTranslator):
         if obj.operation:
             result["operation"] = obj.operation
-        if obj.workspace:
-            result["workspace"] = obj.workspace
         if obj.id:
             result["id"] = obj.id
         if obj.flow:

View file

@@ -49,7 +49,6 @@ class LibraryRequestTranslator(MessageTranslator):
             document_metadata=doc_metadata,
             processing_metadata=proc_metadata,
             content=content,
-            workspace=data.get("workspace", ""),
             collection=data.get("collection", ""),
             criteria=criteria,
             # Chunked upload fields
@@ -76,8 +75,6 @@ class LibraryRequestTranslator(MessageTranslator):
             result["processing-metadata"] = self.proc_metadata_translator.encode(obj.processing_metadata)
         if obj.content:
             result["content"] = obj.content.decode("utf-8") if isinstance(obj.content, bytes) else obj.content
-        if obj.workspace:
-            result["workspace"] = obj.workspace
         if obj.collection:
             result["collection"] = obj.collection
         if obj.criteria is not None:

View file

@@ -19,7 +19,6 @@ class DocumentMetadataTranslator(Translator):
             title=data.get("title"),
             comments=data.get("comments"),
             metadata=self.subgraph_translator.decode(metadata) if metadata is not None else [],
-            workspace=data.get("workspace"),
             tags=data.get("tags"),
             parent_id=data.get("parent-id", ""),
             document_type=data.get("document-type", "source"),
@@ -40,8 +39,6 @@ class DocumentMetadataTranslator(Translator):
             result["comments"] = obj.comments
         if obj.metadata is not None:
             result["metadata"] = self.subgraph_translator.encode(obj.metadata)
-        if obj.workspace:
-            result["workspace"] = obj.workspace
         if obj.tags is not None:
             result["tags"] = obj.tags
         if obj.parent_id:
@@ -61,7 +58,6 @@ class ProcessingMetadataTranslator(Translator):
             document_id=data.get("document-id"),
             time=data.get("time"),
             flow=data.get("flow"),
-            workspace=data.get("workspace"),
             collection=data.get("collection"),
             tags=data.get("tags")
         )
@@ -77,8 +73,6 @@ class ProcessingMetadataTranslator(Translator):
             result["time"] = obj.time
         if obj.flow:
             result["flow"] = obj.flow
-        if obj.workspace:
-            result["workspace"] = obj.workspace
         if obj.collection:
             result["collection"] = obj.collection
         if obj.tags is not None:

View file

@@ -8,7 +8,5 @@ class Metadata:
     # Root document identifier (set by librarian, preserved through pipeline)
     root: str = ""

-    # Collection the message belongs to. Workspace is NOT carried on the
-    # message — consumers derive it from flow.workspace (the flow the
-    # message arrived on), which is the trusted isolation boundary.
+    # Collection the message belongs to.
     collection: str = ""

View file

@@ -17,7 +17,7 @@ from .embeddings import GraphEmbeddings
 # <- (error)
 #
 # list-kg-cores
-# -> (workspace)
+# -> ()
 # <- ()
 # <- (error)
 #
@@ -27,9 +27,6 @@ class KnowledgeRequest:
     # load-kg-core, unload-kg-core
     operation: str = ""

-    # Workspace the cores belong to. Partition / isolation boundary.
-    workspace: str = ""
-
     # get-kg-core, list-kg-cores, delete-kg-core, put-kg-core,
     # load-kg-core, unload-kg-core
     id: str = ""

View file

@@ -22,17 +22,9 @@ class CollectionMetadata:

 @dataclass
 class CollectionManagementRequest:
-    """Request for collection management operations.
-
-    Collection-management is a global (non-flow-scoped) service, so the
-    workspace has to travel on the wire; it's the isolation boundary
-    for which workspace's collections the request operates on.
-    """
+    """Request for collection management operations."""

     operation: str = ""  # e.g., "delete-collection"

-    # Workspace the collection belongs to.
-    workspace: str = ""
-
     collection: str = ""
     timestamp: str = ""  # ISO timestamp
     name: str = ""

View file

@@ -70,6 +70,11 @@ class ConfigResponse:

     # Everything
     error: Error | None = None

+@dataclass
+class WorkspaceChanges:
+    created: list[str] = field(default_factory=list)
+    deleted: list[str] = field(default_factory=list)
+
 @dataclass
 class ConfigPush:
     version: int = 0
@@ -80,6 +85,10 @@ class ConfigPush:
     # e.g. {"prompt": ["workspace-a", "workspace-b"], "schema": ["workspace-a"]}
     changes: dict[str, list[str]] = field(default_factory=dict)

+    # Workspace lifecycle events. Populated when a workspace entry
+    # is created or deleted in the __workspaces__ config namespace.
+    workspace_changes: WorkspaceChanges | None = None
+
 config_request_queue = queue('config', cls='request')
 config_response_queue = queue('config', cls='response')
 config_push_queue = queue('config', cls='notify')
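
A small sketch of what a push carrying lifecycle events looks like to a
consumer; the version, changes dict, and workspace names are illustrative
values only:

    # Constructing and inspecting a ConfigPush with workspace lifecycle events
    push = ConfigPush(
        version=42,
        changes={"workspace": ["__workspaces__"]},
        workspace_changes=WorkspaceChanges(created=["workspace-b"]),
    )

    if push.workspace_changes is not None:
        for ws in push.workspace_changes.created:
            print(f"provision per-workspace resources for {ws}")
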

View file

@@ -22,9 +22,6 @@ class FlowRequest:
     operation: str = ""  # list-blueprints, get-blueprint, put-blueprint, delete-blueprint
                          # list-flows, get-flow, start-flow, stop-flow

-    # Workspace scope — all operations act within this workspace
-    workspace: str = ""
-
     # get_blueprint, put_blueprint, delete_blueprint, start_flow
     blueprint_name: str = ""

View file

@@ -43,12 +43,12 @@ from ..core.metadata import Metadata
 # <- (error)
 #
 # list-documents
-# -> (workspace, collection?)
+# -> (collection?)
 # <- (document_metadata[])
 # <- (error)
 #
 # list-processing
-# -> (workspace, collection?)
+# -> (collection?)
 # <- (processing_metadata[])
 # <- (error)
@@ -78,7 +78,7 @@ from ..core.metadata import Metadata
 # <- (error)
 #
 # list-uploads
-# -> (workspace)
+# -> ()
 # <- (uploads[])
 # <- (error)
 #
@@ -90,7 +89,6 @@ class DocumentMetadata:
     title: str = ""
     comments: str = ""
     metadata: list[Triple] = field(default_factory=list)
-    workspace: str = ""
     tags: list[str] = field(default_factory=list)

     # Child document support
     parent_id: str = ""  # Empty for top-level docs, set for children
@@ -107,7 +106,6 @@ class ProcessingMetadata:
     document_id: str = ""
     time: int = 0
     flow: str = ""
-    workspace: str = ""
     collection: str = ""
     tags: list[str] = field(default_factory=list)
@@ -162,9 +160,6 @@ class LibrarianRequest:
     # add-document, upload-chunk
     content: bytes = b""

-    # Workspace scopes every library operation.
-    workspace: str = ""
-
     # list-documents?, list-processing?
     collection: str = ""