RabbitMQ pub/sub backend with topic exchange architecture (#752)

Adds a RabbitMQ backend as an alternative to Pulsar, selectable via
PUBSUB_BACKEND=rabbitmq. Both backends implement the same PubSubBackend
protocol — no application code changes needed to switch.

RabbitMQ topology:
- Single topic exchange per topicspace (e.g. 'tg')
- Routing key derived from queue class and topic name
- Shared consumers: named queue bound to exchange (competing, round-robin)
- Exclusive consumers: anonymous auto-delete queue (broadcast, each gets
  every message). Used by Subscriber and config push consumer.
- Thread-local producer connections (pika is not thread-safe)
- Push-based consumption via basic_consume with process_data_events
  for heartbeat processing

Consumer model changes:
- Consumer class creates one backend consumer per concurrent task
  (required for pika thread safety, harmless for Pulsar)
- Consumer class accepts consumer_type parameter
- Subscriber passes consumer_type='exclusive' for broadcast semantics
- Config push consumer uses consumer_type='exclusive' so every
  processor instance receives config updates
- handle_one_from_queue receives consumer as parameter for correct
  per-connection ack/nack

LibrarianClient:
- New shared client class replacing duplicated librarian request-response
  code across 6+ services (chunking, decoders, RAG, etc.)
- Uses stream-document instead of get-document-content for fetching
  document content in 1MB chunks (avoids broker message size limits)
- Standalone object (self.librarian = LibrarianClient(...)) not a mixin
- get-document-content marked deprecated in schema and OpenAPI spec

Serialisation:
- Extracted dataclass_to_dict/dict_to_dataclass to shared
  serialization.py (used by both Pulsar and RabbitMQ backends)

Librarian queues:
- Changed from flow class (persistent) back to request/response class
  now that stream-document eliminates large single messages
- API upload chunk size reduced from 5MB to 3MB to stay under broker
  limits after base64 encoding

Factory and CLI:
- get_pubsub() handles 'rabbitmq' backend with RabbitMQ connection params
- add_pubsub_args() includes RabbitMQ options (host, port, credentials)
- add_pubsub_args(standalone=True) defaults to localhost for CLI tools
- init_trustgraph skips Pulsar admin setup for non-Pulsar backends
- tg-dump-queues and tg-monitor-prompts use backend abstraction
- BaseClient and ConfigClient accept generic pubsub config
This commit is contained in:
cybermaggedon 2026-04-02 12:47:16 +01:00 committed by GitHub
parent 4fb0b4d8e8
commit 24f0190ce7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
36 changed files with 1277 additions and 1313 deletions

View file

@ -14,22 +14,18 @@ Tables are preserved as HTML markup for better downstream extraction.
Images are stored in the librarian but not sent to the text pipeline.
"""
import asyncio
import base64
import logging
import magic
import tempfile
import os
import uuid
from unstructured.partition.auto import partition
from ... schema import Document, TextDocument, Metadata
from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
from ... schema import librarian_request_queue, librarian_response_queue
from ... schema import Triples
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
from ... base import Consumer, Producer, ConsumerMetrics, ProducerMetrics
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec, LibrarianClient
from ... provenance import (
document_uri, page_uri as make_page_uri,
@ -166,128 +162,16 @@ class Processor(FlowProcessor):
)
)
# Librarian client for fetching/storing document content
librarian_request_q = params.get(
"librarian_request_queue", default_librarian_request_queue
# Librarian client
self.librarian = LibrarianClient(
id=id, backend=self.pubsub, taskgroup=self.taskgroup,
)
librarian_response_q = params.get(
"librarian_response_queue", default_librarian_response_queue
)
librarian_request_metrics = ProducerMetrics(
processor=id, flow=None, name="librarian-request"
)
self.librarian_request_producer = Producer(
backend=self.pubsub,
topic=librarian_request_q,
schema=LibrarianRequest,
metrics=librarian_request_metrics,
)
librarian_response_metrics = ConsumerMetrics(
processor=id, flow=None, name="librarian-response"
)
self.librarian_response_consumer = Consumer(
taskgroup=self.taskgroup,
backend=self.pubsub,
flow=None,
topic=librarian_response_q,
subscriber=f"{id}-librarian",
schema=LibrarianResponse,
handler=self.on_librarian_response,
metrics=librarian_response_metrics,
)
# Pending librarian requests: request_id -> asyncio.Future
self.pending_requests = {}
logger.info("Universal decoder initialized")
async def start(self):
    """Bring the processor online.

    Runs the base-class startup first, then starts the librarian
    request producer and response consumer in that order.
    """
    await super(Processor, self).start()
    # Start the librarian-facing components after the base processor is up.
    for component in (self.librarian_request_producer,
                      self.librarian_response_consumer):
        await component.start()
async def on_librarian_response(self, msg, consumer, flow):
    """Resolve the pending future that matches this librarian response.

    The correlation id travels in the message properties under "id";
    responses with no (or an unknown) id are silently ignored.
    """
    payload = msg.value()
    correlation_id = msg.properties().get("id")
    if not correlation_id:
        return
    try:
        waiter = self.pending_requests.pop(correlation_id)
    except KeyError:
        # No caller is waiting on this id (e.g. it already timed out).
        return
    waiter.set_result(payload)
async def _librarian_request(self, request, timeout=120):
"""Send a request to the librarian and wait for response."""
request_id = str(uuid.uuid4())
future = asyncio.get_event_loop().create_future()
self.pending_requests[request_id] = future
try:
await self.librarian_request_producer.send(
request, properties={"id": request_id}
)
response = await asyncio.wait_for(future, timeout=timeout)
if response.error:
raise RuntimeError(
f"Librarian error: {response.error.type}: "
f"{response.error.message}"
)
return response
except asyncio.TimeoutError:
self.pending_requests.pop(request_id, None)
raise RuntimeError("Timeout waiting for librarian response")
async def fetch_document_metadata(self, document_id, user):
    """Look up a document's metadata record via the librarian service.

    :param document_id: identifier of the document to look up.
    :param user: owning user for the lookup.
    :returns: the response's document_metadata field.
    """
    response = await self._librarian_request(
        LibrarianRequest(
            operation="get-document-metadata",
            document_id=document_id,
            user=user,
        )
    )
    return response.document_metadata
async def fetch_document_content(self, document_id, user):
    """Fetch a document's raw content via the librarian service.

    NOTE(review): the 'get-document-content' operation is marked
    deprecated in favour of stream-document per the schema change —
    confirm before extending this path.

    :param document_id: identifier of the document to fetch.
    :param user: owning user for the fetch.
    :returns: the response's content field.
    """
    response = await self._librarian_request(
        LibrarianRequest(
            operation="get-document-content",
            document_id=document_id,
            user=user,
        )
    )
    return response.content
async def save_child_document(self, doc_id, parent_id, user, content,
                              document_type="page", title=None,
                              kind="text/plain"):
    """Store a child document under a parent in the librarian.

    Text content is UTF-8 encoded; the payload is base64-encoded for
    transport in the librarian request.

    :param doc_id: identifier for the new child document.
    :param parent_id: identifier of the parent document.
    :param user: owning user.
    :param content: document body, str or bytes.
    :param document_type: logical type of the child (default "page").
    :param title: display title; falls back to doc_id when not given.
    :param kind: MIME type of the content (default "text/plain").
    :returns: doc_id, for caller convenience.
    """
    payload = content.encode("utf-8") if isinstance(content, str) else content
    child_metadata = DocumentMetadata(
        id=doc_id,
        user=user,
        kind=kind,
        title=title or doc_id,
        parent_id=parent_id,
        document_type=document_type,
    )
    await self._librarian_request(
        LibrarianRequest(
            operation="add-child-document",
            document_metadata=child_metadata,
            content=base64.b64encode(payload).decode("utf-8"),
        )
    )
    return doc_id
await self.librarian.start()
def extract_elements(self, blob, mime_type=None):
"""
@ -388,7 +272,7 @@ class Processor(FlowProcessor):
page_content = text.encode("utf-8")
# Save to librarian
await self.save_child_document(
await self.librarian.save_child_document(
doc_id=doc_id,
parent_id=parent_doc_id,
user=metadata.user,
@ -469,7 +353,7 @@ class Processor(FlowProcessor):
# Save to librarian
if img_content:
await self.save_child_document(
await self.librarian.save_child_document(
doc_id=img_uri,
parent_id=parent_doc_id,
user=metadata.user,
@ -518,13 +402,13 @@ class Processor(FlowProcessor):
f"Fetching document {v.document_id} from librarian..."
)
doc_meta = await self.fetch_document_metadata(
doc_meta = await self.librarian.fetch_document_metadata(
document_id=v.document_id,
user=v.metadata.user,
)
mime_type = doc_meta.kind if doc_meta else None
content = await self.fetch_document_content(
content = await self.librarian.fetch_document_content(
document_id=v.document_id,
user=v.metadata.user,
)