Per-flow librarian clients and per-workspace response queues (#865)

Replace singleton LibrarianClient with per-flow instances via the new
LibrarianSpec, giving each flow its own librarian tied to the
workspace-scoped request/response queues from the blueprint.

Move all workspace-scoped services (config, flow, librarian, knowledge)
from a single base-queue response producer to per-workspace response
producers created alongside the existing per-workspace request
consumers.  Update the gateway dispatcher and bootstrapper flow client
to subscribe to the matching workspace-scoped response queues.

Fix WorkspaceInit to register workspaces through the IAM
create-workspace API so they appear in __workspaces__ and are visible
to the gateway.  Simplify the bootstrapper gate to check only
config-svc reachability.

Update tests accordingly.
This commit is contained in:
cybermaggedon 2026-05-06 12:01:01 +01:00 committed by GitHub
parent 01bf1d89d5
commit 03cc5ac80f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
30 changed files with 405 additions and 735 deletions

View file

@ -16,6 +16,7 @@ from . subscriber_spec import SubscriberSpec
from . request_response_spec import RequestResponseSpec
from . llm_service import LlmService, LlmResult, LlmChunk
from . librarian_client import LibrarianClient
from . librarian_spec import LibrarianSpec
from . chunking_service import ChunkingService
from . embeddings_service import EmbeddingsService
from . embeddings_client import EmbeddingsClientSpec

View file

@ -4,13 +4,11 @@ for chunk-size and chunk-overlap parameters, and librarian client for
fetching large document content.
"""
import asyncio
import base64
import logging
from .flow_processor import FlowProcessor
from .parameter_spec import ParameterSpec
from .librarian_client import LibrarianClient
from .librarian_spec import LibrarianSpec
# Module logger
logger = logging.getLogger(__name__)
@ -35,35 +33,27 @@ class ChunkingService(FlowProcessor):
ParameterSpec(name="chunk-overlap")
)
# Librarian client
self.librarian = LibrarianClient(
id=id,
backend=self.pubsub,
taskgroup=self.taskgroup,
self.register_specification(
LibrarianSpec()
)
logger.debug("ChunkingService initialized with parameter specifications")
async def start(self):
await super(ChunkingService, self).start()
await self.librarian.start()
async def get_document_text(self, doc, flow):
    """
    Get text content from a TextDocument, fetching from librarian if needed.

    The stripped diff left both the old (workspace-based) and new
    (flow-based) lines interleaved here; this is the coherent
    post-change version, which routes the fetch through the per-flow
    librarian client.

    Args:
        doc: TextDocument with either inline text or document_id
        flow: Flow object carrying the per-flow librarian client

    Returns:
        str: The document text content
    """
    if doc.document_id and not doc.text:
        logger.info(f"Fetching document {doc.document_id} from librarian...")
        text = await flow.librarian.fetch_document_text(
            document_id=doc.document_id,
        )
        logger.info(f"Fetched {len(text)} characters from librarian")
        return text
    # NOTE(review): the tail of this method is truncated in the diff view;
    # returning the inline text matches the documented contract — confirm
    # against the full file.
    return doc.text

View file

@ -1,6 +1,4 @@
import asyncio
class Flow:
"""
Runtime representation of a deployed flow process.
@ -22,16 +20,22 @@ class Flow:
self.parameter = {}
self.librarian = None
for spec in processor.specifications:
spec.add(self, processor, defn)
async def start(self):
    """Start the flow's runtime services.

    Starts the per-flow librarian client first, if one was attached
    (``self.librarian`` defaults to ``None``), then each registered
    consumer.
    """
    if self.librarian:
        await self.librarian.start()
    for c in self.consumer.values():
        await c.start()
async def stop(self):
    """Stop the flow's runtime services.

    Stops each registered consumer first, then the per-flow librarian
    client if one was attached — the reverse of start() order.
    """
    for c in self.consumer.values():
        await c.stop()
    if self.librarian:
        await self.librarian.stop()
def __call__(self, key):
if key in self.producer: return self.producer[key]

View file

@ -10,7 +10,7 @@ Usage:
id=id, backend=self.pubsub, taskgroup=self.taskgroup, **params
)
await self.librarian.start()
content = await self.librarian.fetch_document_content(doc_id, workspace)
content = await self.librarian.fetch_document_content(doc_id)
"""
import asyncio
@ -39,9 +39,14 @@ class LibrarianClient:
librarian_response_q = params.get(
"librarian_response_queue", librarian_response_queue,
)
subscriber = params.get(
"librarian_subscriber", f"{id}-librarian",
)
flow_name = params.get("flow_name")
librarian_request_metrics = ProducerMetrics(
processor=id, flow=None, name="librarian-request",
processor=id, flow=flow_name, name="librarian-request",
)
self._producer = Producer(
@ -52,7 +57,7 @@ class LibrarianClient:
)
librarian_response_metrics = ConsumerMetrics(
processor=id, flow=None, name="librarian-response",
processor=id, flow=flow_name, name="librarian-response",
)
self._consumer = Consumer(
@ -60,7 +65,7 @@ class LibrarianClient:
backend=backend,
flow=None,
topic=librarian_response_q,
subscriber=f"{id}-librarian",
subscriber=subscriber,
schema=LibrarianResponse,
handler=self._on_response,
metrics=librarian_response_metrics,
@ -76,6 +81,11 @@ class LibrarianClient:
await self._producer.start()
await self._consumer.start()
async def stop(self):
    """Stop the librarian producer and consumer.

    Stops the response consumer first, then the request producer.
    """
    await self._consumer.stop()
    await self._producer.stop()
async def _on_response(self, msg, consumer, flow):
"""Route librarian responses to the right waiter."""
response = msg.value()
@ -150,7 +160,7 @@ class LibrarianClient:
finally:
self._streams.pop(request_id, None)
async def fetch_document_content(self, document_id, workspace, timeout=120):
async def fetch_document_content(self, document_id, timeout=120):
"""Fetch document content using streaming.
Returns base64-encoded content. Caller is responsible for decoding.
@ -158,7 +168,6 @@ class LibrarianClient:
req = LibrarianRequest(
operation="stream-document",
document_id=document_id,
workspace=workspace,
)
chunks = await self.stream(req, timeout=timeout)
@ -176,24 +185,23 @@ class LibrarianClient:
return base64.b64encode(raw)
async def fetch_document_text(self, document_id, timeout=120):
    """Fetch document content and decode as UTF-8 text.

    The stripped diff left the old workspace-taking signature and call
    interleaved with the new ones; this is the coherent post-change
    version (workspace scoping now lives in the queue wiring).

    Args:
        document_id: ID of the document to fetch
        timeout: seconds to wait for the streamed content

    Returns:
        str: the document content decoded as UTF-8
    """
    # fetch_document_content returns base64-encoded bytes; decode here.
    content = await self.fetch_document_content(
        document_id, timeout=timeout,
    )
    return base64.b64decode(content).decode("utf-8")
async def fetch_document_metadata(self, document_id, timeout=120):
    """Fetch document metadata from the librarian.

    The stripped diff left the old workspace-taking signature and
    request field interleaved with the new ones; this is the coherent
    post-change version.

    Args:
        document_id: ID of the document
        timeout: seconds to wait for the response

    Returns:
        The ``document_metadata`` field of the librarian response.
    """
    req = LibrarianRequest(
        operation="get-document-metadata",
        document_id=document_id,
    )
    response = await self.request(req, timeout=timeout)
    return response.document_metadata
async def save_child_document(self, doc_id, parent_id, workspace, content,
async def save_child_document(self, doc_id, parent_id, content,
document_type="chunk", title=None,
kind="text/plain", timeout=120):
"""Save a child document to the librarian."""
@ -217,7 +225,7 @@ class LibrarianClient:
await self.request(req, timeout=timeout)
return doc_id
async def save_document(self, doc_id, workspace, content, title=None,
async def save_document(self, doc_id, content, title=None,
document_type="answer", kind="text/plain",
timeout=120):
"""Save a document to the librarian."""
@ -236,7 +244,6 @@ class LibrarianClient:
document_id=doc_id,
document_metadata=doc_metadata,
content=base64.b64encode(content).decode("utf-8"),
workspace=workspace,
)
await self.request(req, timeout=timeout)

View file

@ -0,0 +1,31 @@
from __future__ import annotations
import uuid
from typing import Any
from . spec import Spec
from . librarian_client import LibrarianClient
class LibrarianSpec(Spec):
    """Specification that attaches a per-flow LibrarianClient.

    When added to a flow, builds a LibrarianClient wired to the
    request/response queues named in the flow definition's ``topics``
    map, and stores it on the flow as ``flow.librarian``.
    """

    def __init__(self, request_name="librarian-request",
                 response_name="librarian-response"):
        # Keys looked up under definition["topics"] to find the
        # librarian request/response queues for this flow.
        self.request_name = request_name
        self.response_name = response_name

    def add(self, flow: Any, processor: Any, definition: dict[str, Any]) -> None:
        """Create a librarian client for *flow* and attach it."""
        topics = definition["topics"]
        # Subscriber name is unique per flow instance (random UUID
        # suffix) so each flow gets its own subscription on the shared
        # workspace-scoped response queue.
        subscriber = (
            f"{processor.id}--{flow.workspace}--"
            f"{flow.name}--librarian--{uuid.uuid4()}"
        )
        flow.librarian = LibrarianClient(
            id=flow.id,
            backend=processor.pubsub,
            taskgroup=processor.taskgroup,
            librarian_request_queue=topics[self.request_name],
            librarian_response_queue=topics[self.response_name],
            librarian_subscriber=subscriber,
            flow_name=flow.name,
        )