Basic multitenant support (#583)

* Tech spec

* Address multi-tenant queue option problems in CLI

* Modified collection service to use config

* Changed storage management to use the config service definition
This commit is contained in:
cybermaggedon 2025-12-05 21:45:30 +00:00 committed by GitHub
parent 789d9713a0
commit 7d07f802a8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
28 changed files with 1416 additions and 1731 deletions

View file

@ -1,142 +1,130 @@
"""
Collection management for the librarian
Collection management for the librarian - uses config service for storage
"""
import asyncio
import logging
import json
import uuid
from datetime import datetime
from typing import Dict, Any, List, Optional
from .. schema import CollectionManagementRequest, CollectionManagementResponse, Error
from .. schema import CollectionMetadata
from .. schema import StorageManagementRequest, StorageManagementResponse
from .. schema import ConfigRequest, ConfigResponse
from .. exceptions import RequestError
from .. tables.library import LibraryTableStore
# Module logger
logger = logging.getLogger(__name__)
class CollectionManager:
"""Manages collection metadata and coordinates collection operations across storage types"""
"""Manages collection metadata via config service"""
def __init__(
self,
cassandra_host,
cassandra_username,
cassandra_password,
keyspace,
vector_storage_producer=None,
object_storage_producer=None,
triples_storage_producer=None,
storage_response_consumer=None
config_request_producer,
config_response_consumer,
taskgroup
):
"""
Initialize the CollectionManager
Args:
cassandra_host: Cassandra host(s)
cassandra_username: Cassandra username
cassandra_password: Cassandra password
keyspace: Cassandra keyspace for library data
vector_storage_producer: Producer for vector storage management
object_storage_producer: Producer for object storage management
triples_storage_producer: Producer for triples storage management
storage_response_consumer: Consumer for storage management responses
config_request_producer: Producer for config service requests
config_response_consumer: Consumer for config service responses
taskgroup: Task group for async operations
"""
self.table_store = LibraryTableStore(
cassandra_host, cassandra_username, cassandra_password, keyspace
)
self.config_request_producer = config_request_producer
self.config_response_consumer = config_response_consumer
self.taskgroup = taskgroup
# Storage management producers
self.vector_storage_producer = vector_storage_producer
self.object_storage_producer = object_storage_producer
self.triples_storage_producer = triples_storage_producer
self.storage_response_consumer = storage_response_consumer
# Track pending config requests
self.pending_config_requests = {}
# Track pending deletion operations
self.pending_deletions = {}
logger.info("Collection manager initialized with config service backend")
logger.info("Collection manager initialized")
async def send_config_request(self, request: "ConfigRequest") -> "ConfigResponse":
    """
    Send a config request and wait for the matching response.

    Correlation is done through ``request.id``: an ``asyncio.Event`` is
    registered in ``self.pending_config_requests`` under that id, and
    ``on_config_response`` stores the response under
    ``request.id + "_response"`` before setting the event.

    Args:
        request: Config service request. Its ``id`` is used as the
            correlation key, so it must be unique among in-flight requests.

    Returns:
        The response object stored for this request id.

    Raises:
        KeyError: If the event was set without a response being stored.
        Exception: Anything raised by ``config_request_producer.send``;
            tracking entries are removed before it propagates.
    """
    response_key = request.id + "_response"
    event = asyncio.Event()

    # Register the waiter before sending so a fast response is not dropped
    # by on_config_response as "unknown id".
    self.pending_config_requests[request.id] = event

    try:
        await self.config_request_producer.send(request)

        # NOTE(review): this waits without a timeout; if the config service
        # never answers, the caller blocks indefinitely.
        await event.wait()

        return self.pending_config_requests.pop(response_key)
    finally:
        # Always drop both tracking entries. Previously the Event stayed in
        # the dict forever (one leaked entry per request), and a failed
        # send() left the request registered as pending.
        self.pending_config_requests.pop(request.id, None)
        self.pending_config_requests.pop(response_key, None)
async def on_config_response(self, message, consumer, flow):
    """
    Handle a config service response by waking the matching waiter.

    The response is matched on ``response.id`` against the events that
    ``send_config_request`` registers in ``self.pending_config_requests``.
    On a match the response is stored under ``response.id + "_response"``
    and the event is set. Responses with an unknown id are ignored.

    Args:
        message: Incoming message; ``message.value()`` yields the response
        consumer: Consumer instance (unused)
        flow: Flow context (unused)
    """
    response = message.value()

    waiter = self.pending_config_requests.get(response.id)

    # The tracking dict holds both Events (keyed by id) and stored responses
    # (keyed by "<id>_response"), so only a real Event counts as a waiter.
    # This also covers ids that were never registered here.
    if not isinstance(waiter, asyncio.Event):
        return

    # A duplicate response for an already-signalled request must not store
    # a second "<id>_response" entry: nobody would ever pop it.
    if waiter.is_set():
        return

    # Store the response first, then wake the waiter, so the waiter always
    # finds the response when it resumes.
    self.pending_config_requests[response.id + "_response"] = response
    waiter.set()
async def ensure_collection_exists(self, user: str, collection: str):
"""
Ensure a collection exists, creating it if necessary with broadcast to storage
Ensure a collection exists, creating it if necessary
Args:
user: User ID
collection: Collection ID
"""
try:
# Check if collection already exists
existing = await self.table_store.get_collection(user, collection)
if existing:
# Check if collection exists via config service
request = ConfigRequest(
id=str(uuid.uuid4()),
operation='get',
type='collection',
keys=[f'{user}:{collection}']
)
response = await self.send_config_request(request)
# If collection exists, we're done
if response.values and len(response.values) > 0:
logger.debug(f"Collection {user}/{collection} already exists")
return
# Create new collection with default metadata
logger.info(f"Auto-creating collection {user}/{collection} from document submission")
await self.table_store.create_collection(
logger.info(f"Auto-creating collection {user}/{collection}")
metadata = CollectionMetadata(
user=user,
collection=collection,
name=collection, # Default name to collection ID
description="",
tags=set()
tags=[]
)
# Broadcast collection creation to all storage backends
creation_key = (user, collection)
logger.info(f"Broadcasting create-collection for {creation_key}")
self.pending_deletions[creation_key] = {
"responses_pending": 4, # doc-embeddings, graph-embeddings, object, triples
"responses_received": [],
"all_successful": True,
"error_messages": [],
"deletion_complete": asyncio.Event()
}
storage_request = StorageManagementRequest(
operation="create-collection",
user=user,
collection=collection
request = ConfigRequest(
id=str(uuid.uuid4()),
operation='put',
type='collection',
key=f'{user}:{collection}',
value=json.dumps(metadata.to_dict())
)
# Send creation requests to all storage types
if self.vector_storage_producer:
await self.vector_storage_producer.send(storage_request)
if self.object_storage_producer:
await self.object_storage_producer.send(storage_request)
if self.triples_storage_producer:
await self.triples_storage_producer.send(storage_request)
response = await self.send_config_request(request)
# Wait for all storage creations to complete (with timeout)
creation_info = self.pending_deletions[creation_key]
try:
await asyncio.wait_for(
creation_info["deletion_complete"].wait(),
timeout=30.0 # 30 second timeout
)
except asyncio.TimeoutError:
logger.error(f"Timeout waiting for storage creation responses for {creation_key}")
creation_info["all_successful"] = False
creation_info["error_messages"].append("Timeout waiting for storage creation")
if response.error:
raise RuntimeError(f"Config update failed: {response.error.message}")
# Check if all creations succeeded
if not creation_info["all_successful"]:
error_msg = f"Storage creation failed: {'; '.join(creation_info['error_messages'])}"
logger.error(error_msg)
# Clean up metadata on failure
await self.table_store.delete_collection(user, collection)
# Clean up tracking
del self.pending_deletions[creation_key]
raise RuntimeError(error_msg)
# Clean up tracking
del self.pending_deletions[creation_key]
logger.info(f"Collection {creation_key} auto-created successfully in all storage backends")
logger.info(f"Collection {user}/{collection} auto-created in config service")
except Exception as e:
logger.error(f"Error ensuring collection exists: {e}")
@ -144,7 +132,7 @@ class CollectionManager:
async def list_collections(self, request: CollectionManagementRequest) -> CollectionManagementResponse:
"""
List collections for a user with optional tag filtering
List collections for a user from config service
Args:
request: Collection management request
@ -153,25 +141,43 @@ class CollectionManager:
CollectionManagementResponse with list of collections
"""
try:
tag_filter = list(request.tag_filter) if request.tag_filter else None
collections = await self.table_store.list_collections(request.user, tag_filter)
# Get all collections from config service
config_request = ConfigRequest(
id=str(uuid.uuid4()),
operation='getvalues',
type='collection'
)
collection_metadata = [
CollectionMetadata(
user=coll["user"],
collection=coll["collection"],
name=coll["name"],
description=coll["description"],
tags=coll["tags"],
created_at=coll["created_at"],
updated_at=coll["updated_at"]
)
for coll in collections
]
response = await self.send_config_request(config_request)
if response.error:
raise RuntimeError(f"Config query failed: {response.error.message}")
# Parse collections and filter by user
collections = []
for key, value_json in response.values.items():
if ":" in key:
coll_user, coll_name = key.split(":", 1)
if coll_user == request.user:
metadata_dict = json.loads(value_json)
metadata = CollectionMetadata(**metadata_dict)
collections.append(metadata)
# Apply tag filtering if specified
if request.tag_filter:
tag_filter_set = set(request.tag_filter)
collections = [
c for c in collections
if any(tag in tag_filter_set for tag in c.tags)
]
# Apply limit if specified
if request.limit and request.limit > 0:
collections = collections[:request.limit]
return CollectionManagementResponse(
error=None,
collections=collection_metadata,
collections=collections,
timestamp=datetime.now().isoformat()
)
@ -181,7 +187,7 @@ class CollectionManager:
async def update_collection(self, request: CollectionManagementRequest) -> CollectionManagementResponse:
"""
Update collection metadata (creates if doesn't exist)
Update collection metadata via config service (creates if doesn't exist)
Args:
request: Collection management request
@ -190,120 +196,41 @@ class CollectionManager:
CollectionManagementResponse with updated collection
"""
try:
# Check if collection exists, create if it doesn't
existing = await self.table_store.get_collection(request.user, request.collection)
if not existing:
# Create new collection with provided metadata
logger.info(f"Creating new collection {request.user}/{request.collection}")
# Create metadata from request
name = request.name if request.name else request.collection
description = request.description if request.description else ""
tags = list(request.tags) if request.tags else []
name = request.name if request.name else request.collection
description = request.description if request.description else ""
tags = set(request.tags) if request.tags else set()
metadata = CollectionMetadata(
user=request.user,
collection=request.collection,
name=name,
description=description,
tags=tags
)
await self.table_store.create_collection(
user=request.user,
collection=request.collection,
name=name,
description=description,
tags=tags
)
# Send put request to config service
config_request = ConfigRequest(
id=str(uuid.uuid4()),
operation='put',
type='collection',
key=f'{request.user}:{request.collection}',
value=json.dumps(metadata.to_dict())
)
# Broadcast collection creation to all storage backends
creation_key = (request.user, request.collection)
logger.info(f"Broadcasting create-collection for {creation_key}")
response = await self.send_config_request(config_request)
self.pending_deletions[creation_key] = {
"responses_pending": 4, # doc-embeddings, graph-embeddings, object, triples
"responses_received": [],
"all_successful": True,
"error_messages": [],
"deletion_complete": asyncio.Event()
}
if response.error:
raise RuntimeError(f"Config update failed: {response.error.message}")
storage_request = StorageManagementRequest(
operation="create-collection",
user=request.user,
collection=request.collection
)
logger.info(f"Collection {request.user}/{request.collection} updated in config service")
# Send creation requests to all storage types
if self.vector_storage_producer:
await self.vector_storage_producer.send(storage_request)
if self.object_storage_producer:
await self.object_storage_producer.send(storage_request)
if self.triples_storage_producer:
await self.triples_storage_producer.send(storage_request)
# Wait for all storage creations to complete (with timeout)
creation_info = self.pending_deletions[creation_key]
try:
await asyncio.wait_for(
creation_info["deletion_complete"].wait(),
timeout=30.0 # 30 second timeout
)
except asyncio.TimeoutError:
logger.error(f"Timeout waiting for storage creation responses for {creation_key}")
creation_info["all_successful"] = False
creation_info["error_messages"].append("Timeout waiting for storage creation")
# Check if all creations succeeded
if not creation_info["all_successful"]:
error_msg = f"Storage creation failed: {'; '.join(creation_info['error_messages'])}"
logger.error(error_msg)
# Clean up metadata on failure
await self.table_store.delete_collection(request.user, request.collection)
# Clean up tracking
del self.pending_deletions[creation_key]
return CollectionManagementResponse(
error=Error(
type="storage_creation_error",
message=error_msg
),
timestamp=datetime.now().isoformat()
)
# Clean up tracking
del self.pending_deletions[creation_key]
logger.info(f"Collection {creation_key} created successfully in all storage backends")
# Get the newly created collection for response
created_collection = await self.table_store.get_collection(request.user, request.collection)
collection_metadata = CollectionMetadata(
user=created_collection["user"],
collection=created_collection["collection"],
name=created_collection["name"],
description=created_collection["description"],
tags=created_collection["tags"],
created_at=created_collection["created_at"],
updated_at=created_collection["updated_at"]
)
else:
# Collection exists, update it
name = request.name if request.name else None
description = request.description if request.description else None
tags = list(request.tags) if request.tags else None
updated_collection = await self.table_store.update_collection(
request.user, request.collection, name, description, tags
)
collection_metadata = CollectionMetadata(
user=updated_collection["user"],
collection=updated_collection["collection"],
name=updated_collection["name"],
description=updated_collection["description"],
tags=updated_collection["tags"],
created_at="", # Not returned by update
updated_at=updated_collection["updated_at"]
)
# Config service will trigger config push automatically
# Storage services will receive update and create/update collections
return CollectionManagementResponse(
error=None,
collections=[collection_metadata],
collections=[metadata],
timestamp=datetime.now().isoformat()
)
@ -313,7 +240,7 @@ class CollectionManager:
async def delete_collection(self, request: CollectionManagementRequest) -> CollectionManagementResponse:
"""
Delete collection with cascade to all storage types
Delete collection via config service
Args:
request: Collection management request
@ -322,68 +249,25 @@ class CollectionManager:
CollectionManagementResponse indicating success or failure
"""
try:
deletion_key = (request.user, request.collection)
logger.info(f"Deleting collection {request.user}/{request.collection}")
logger.info(f"Starting cascade deletion for {request.user}/{request.collection}")
# Track this deletion request
self.pending_deletions[deletion_key] = {
"responses_pending": 4, # doc-embeddings, graph-embeddings, object, triples
"responses_received": [],
"all_successful": True,
"error_messages": [],
"deletion_complete": asyncio.Event()
}
# Create storage management request
storage_request = StorageManagementRequest(
operation="delete-collection",
user=request.user,
collection=request.collection
# Send delete request to config service
config_request = ConfigRequest(
id=str(uuid.uuid4()),
operation='delete',
type='collection',
key=f'{request.user}:{request.collection}'
)
# Send deletion requests to all storage types
if self.vector_storage_producer:
await self.vector_storage_producer.send(storage_request)
if self.object_storage_producer:
await self.object_storage_producer.send(storage_request)
if self.triples_storage_producer:
await self.triples_storage_producer.send(storage_request)
response = await self.send_config_request(config_request)
# Wait for all storage deletions to complete (with timeout)
deletion_info = self.pending_deletions[deletion_key]
try:
await asyncio.wait_for(
deletion_info["deletion_complete"].wait(),
timeout=30.0 # 30 second timeout
)
except asyncio.TimeoutError:
logger.error(f"Timeout waiting for storage deletion responses for {deletion_key}")
deletion_info["all_successful"] = False
deletion_info["error_messages"].append("Timeout waiting for storage deletion")
if response.error:
raise RuntimeError(f"Config delete failed: {response.error.message}")
# Check if all deletions succeeded
if not deletion_info["all_successful"]:
error_msg = f"Storage deletion failed: {'; '.join(deletion_info['error_messages'])}"
logger.error(error_msg)
logger.info(f"Collection {request.user}/{request.collection} deleted from config service")
# Clean up tracking
del self.pending_deletions[deletion_key]
return CollectionManagementResponse(
error=Error(
type="storage_deletion_error",
message=error_msg
),
timestamp=datetime.now().isoformat()
)
# All storage deletions succeeded, now delete metadata
logger.info(f"Storage deletions complete, removing metadata for {deletion_key}")
await self.table_store.delete_collection(request.user, request.collection)
# Clean up tracking
del self.pending_deletions[deletion_key]
# Config service will trigger config push automatically
# Storage services will receive update and delete collections
return CollectionManagementResponse(
error=None,
@ -392,39 +276,4 @@ class CollectionManager:
except Exception as e:
logger.error(f"Error deleting collection: {e}")
# Clean up tracking on error
if deletion_key in self.pending_deletions:
del self.pending_deletions[deletion_key]
raise RequestError(f"Failed to delete collection: {str(e)}")
async def on_storage_response(self, response: StorageManagementResponse):
    """
    Record a storage management response against a pending operation.

    The response carries no correlation key that this method reads, so it
    is attributed to the first entry in ``self.pending_deletions`` that is
    still waiting for responses. When that entry's outstanding count reaches
    zero, its ``deletion_complete`` event is set.

    Args:
        response: Storage management response
    """
    logger.debug(f"Received storage response: error={response.error}")

    # Pick the first tracked operation that still expects responses.
    # Simplified correlation: there is nothing tying the response to a
    # specific (user, collection) key.
    match = next(
        (
            (key, tracker)
            for key, tracker in list(self.pending_deletions.items())
            if tracker["responses_pending"] > 0
        ),
        None,
    )
    if match is None:
        return

    deletion_key, tracker = match

    # Book-keeping for this response
    tracker["responses_received"].append(response)
    tracker["responses_pending"] -= 1

    # A response counts as a failure only when it has an error with a message
    if not (response.error and response.error.message):
        logger.debug(f"Storage operation succeeded for {deletion_key}")
    else:
        tracker["all_successful"] = False
        tracker["error_messages"].append(response.error.message)
        logger.warning(f"Storage operation failed for {deletion_key}: {response.error.message}")

    # Last outstanding response: wake whoever is waiting on this operation
    if tracker["responses_pending"] == 0:
        logger.info(f"All storage responses received for {deletion_key}")
        tracker["deletion_complete"].set()

View file

@ -18,9 +18,8 @@ from .. schema import LibrarianRequest, LibrarianResponse, Error
from .. schema import librarian_request_queue, librarian_response_queue
from .. schema import CollectionManagementRequest, CollectionManagementResponse
from .. schema import collection_request_queue, collection_response_queue
from .. schema import StorageManagementRequest, StorageManagementResponse
from .. schema import vector_storage_management_topic, object_storage_management_topic
from .. schema import triples_storage_management_topic, storage_management_response_topic
from .. schema import ConfigRequest, ConfigResponse
from .. schema import config_request_queue, config_response_queue
from .. schema import Document, Metadata
from .. schema import TextDocument, Metadata
@ -39,6 +38,8 @@ default_librarian_request_queue = librarian_request_queue
default_librarian_response_queue = librarian_response_queue
default_collection_request_queue = collection_request_queue
default_collection_response_queue = collection_response_queue
default_config_request_queue = config_request_queue
default_config_response_queue = config_response_queue
default_minio_host = "minio:9000"
default_minio_access_key = "minioadmin"
@ -47,9 +48,6 @@ default_cassandra_host = "cassandra"
bucket_name = "library"
# FIXME: How to ensure this doesn't conflict with other usage?
keyspace = "librarian"
class Processor(AsyncProcessor):
def __init__(self, **params):
@ -74,6 +72,14 @@ class Processor(AsyncProcessor):
"collection_response_queue", default_collection_response_queue
)
config_request_queue = params.get(
"config_request_queue", default_config_request_queue
)
config_response_queue = params.get(
"config_response_queue", default_config_response_queue
)
minio_host = params.get("minio_host", default_minio_host)
minio_access_key = params.get(
"minio_access_key",
@ -87,14 +93,15 @@ class Processor(AsyncProcessor):
cassandra_host = params.get("cassandra_host")
cassandra_username = params.get("cassandra_username")
cassandra_password = params.get("cassandra_password")
# Resolve configuration with environment variable fallback
hosts, username, password = resolve_cassandra_config(
hosts, username, password, keyspace = resolve_cassandra_config(
host=cassandra_host,
username=cassandra_username,
password=cassandra_password
password=cassandra_password,
default_keyspace="librarian"
)
# Store resolved configuration
self.cassandra_host = hosts
self.cassandra_username = username
@ -170,34 +177,31 @@ class Processor(AsyncProcessor):
metrics = collection_response_metrics,
)
# Storage management producers for collection deletion
self.vector_storage_producer = Producer(
client = self.pulsar_client,
topic = vector_storage_management_topic,
schema = StorageManagementRequest,
# Config service client for collection management
config_request_metrics = ProducerMetrics(
processor = id, flow = None, name = "config-request"
)
self.object_storage_producer = Producer(
self.config_request_producer = Producer(
client = self.pulsar_client,
topic = object_storage_management_topic,
schema = StorageManagementRequest,
topic = config_request_queue,
schema = ConfigRequest,
metrics = config_request_metrics,
)
self.triples_storage_producer = Producer(
client = self.pulsar_client,
topic = triples_storage_management_topic,
schema = StorageManagementRequest,
config_response_metrics = ConsumerMetrics(
processor = id, flow = None, name = "config-response"
)
self.storage_response_consumer = Consumer(
self.config_response_consumer = Consumer(
taskgroup = self.taskgroup,
client = self.pulsar_client,
flow = None,
topic = storage_management_response_topic,
subscriber = id,
schema = StorageManagementResponse,
handler = self.on_storage_response,
metrics = storage_response_metrics,
topic = config_response_queue,
subscriber = f"{id}-config",
schema = ConfigResponse,
handler = self.on_config_response,
metrics = config_response_metrics,
)
self.librarian = Librarian(
@ -213,14 +217,9 @@ class Processor(AsyncProcessor):
)
self.collection_manager = CollectionManager(
cassandra_host = self.cassandra_host,
cassandra_username = self.cassandra_username,
cassandra_password = self.cassandra_password,
keyspace = keyspace,
vector_storage_producer = self.vector_storage_producer,
object_storage_producer = self.object_storage_producer,
triples_storage_producer = self.triples_storage_producer,
storage_response_consumer = self.storage_response_consumer,
config_request_producer = self.config_request_producer,
config_response_consumer = self.config_response_consumer,
taskgroup = self.taskgroup,
)
self.register_config_handler(self.on_librarian_config)
@ -236,10 +235,12 @@ class Processor(AsyncProcessor):
await self.librarian_response_producer.start()
await self.collection_request_consumer.start()
await self.collection_response_producer.start()
await self.vector_storage_producer.start()
await self.object_storage_producer.start()
await self.triples_storage_producer.start()
await self.storage_response_consumer.start()
await self.config_request_producer.start()
await self.config_response_consumer.start()
async def on_config_response(self, message, consumer, flow):
    """
    Relay a config service response to the collection manager.

    Args:
        message: Incoming config response message
        consumer: Consumer instance the message arrived on
        flow: Flow context
    """
    manager = self.collection_manager
    await manager.on_config_response(message, consumer, flow)
async def on_librarian_config(self, config, version):
@ -464,14 +465,6 @@ class Processor(AsyncProcessor):
logger.debug("Collection request processing complete")
async def on_storage_response(self, msg, consumer, flow):
    """
    Unwrap a storage management response message and pass its payload
    to the collection manager.

    Args:
        msg: Incoming message; ``msg.value()`` yields the response payload
        consumer: Consumer instance (unused)
        flow: Flow context (unused)
    """
    payload = msg.value()
    logger.debug("Received storage management response")

    manager = self.collection_manager
    await manager.on_storage_response(payload)
@staticmethod
def add_args(parser):