Collection management part 2 (#522)

* Plumb collection manager into librarian

* Test end-to-end
This commit is contained in:
cybermaggedon 2025-09-19 16:08:47 +01:00 committed by GitHub
parent d378db9370
commit fcd15d1833
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 617 additions and 434 deletions

View file

@ -31,9 +31,9 @@ class KnowledgeGraph:
self.table = "triples" # Legacy single table
else:
# New optimized tables
self.subject_table = "triples_by_subject"
self.po_table = "triples_by_po"
self.object_table = "triples_by_object"
self.subject_table = "triples_s"
self.po_table = "triples_p"
self.object_table = "triples_o"
if username and password:
ssl_context = SSLContext(PROTOCOL_TLSv1_2)
@ -157,7 +157,7 @@ class KnowledgeGraph:
# Query statements for optimized access
self.get_all_stmt = self.session.prepare(
f"SELECT s, p, o FROM {self.subject_table} WHERE collection = ? LIMIT ?"
f"SELECT s, p, o FROM {self.subject_table} WHERE collection = ? LIMIT ? ALLOW FILTERING"
)
self.get_s_stmt = self.session.prepare(
@ -182,7 +182,7 @@ class KnowledgeGraph:
)
self.get_os_stmt = self.session.prepare(
f"SELECT p FROM {self.subject_table} WHERE collection = ? AND s = ? AND o = ? LIMIT ?"
f"SELECT p FROM {self.object_table} WHERE collection = ? AND o = ? AND s = ? LIMIT ?"
)
self.get_spo_stmt = self.session.prepare(

View file

@ -22,7 +22,9 @@ class CollectionManagementRequestor(ServiceRequestor):
self.response_translator = TranslatorRegistry.get_response_translator("collection-management")
def to_request(self, body):
print("REQUEST", body, flush=True)
return self.request_translator.to_pulsar(body)
def from_response(self, message):
return self.response_translator.from_response_with_completion(message)
print("RESPONSE", message, flush=True)
return self.response_translator.from_response_with_completion(message)

View file

@ -0,0 +1,315 @@
"""
Collection management for the librarian
"""
import asyncio
import logging
from datetime import datetime
from typing import Dict, Any, List, Optional
from .. schema import CollectionManagementRequest, CollectionManagementResponse, Error
from .. schema import CollectionMetadata
from .. schema import StorageManagementRequest, StorageManagementResponse
from .. exceptions import RequestError
from .. tables.library import LibraryTableStore
# Module logger
logger = logging.getLogger(__name__)
class CollectionManager:
    """Manages collection metadata and coordinates collection operations
    (list / update / cascade delete) across the vector, object, and
    triples storage services."""

    def __init__(
        self,
        cassandra_host,
        cassandra_username,
        cassandra_password,
        keyspace,
        vector_storage_producer=None,
        object_storage_producer=None,
        triples_storage_producer=None,
        storage_response_consumer=None
    ):
        """
        Initialize the CollectionManager

        Args:
            cassandra_host: Cassandra host(s)
            cassandra_username: Cassandra username
            cassandra_password: Cassandra password
            keyspace: Cassandra keyspace for library data
            vector_storage_producer: Producer for vector storage management
            object_storage_producer: Producer for object storage management
            triples_storage_producer: Producer for triples storage management
            storage_response_consumer: Consumer for storage management responses
        """
        self.table_store = LibraryTableStore(
            cassandra_host, cassandra_username, cassandra_password, keyspace
        )

        # Storage management producers; any of these may be None when the
        # corresponding storage service is not wired in.
        self.vector_storage_producer = vector_storage_producer
        self.object_storage_producer = object_storage_producer
        self.triples_storage_producer = triples_storage_producer
        self.storage_response_consumer = storage_response_consumer

        # Pending cascade deletions, keyed by (user, collection)
        self.pending_deletions = {}

        logger.info("Collection manager initialized")

    async def ensure_collection_exists(self, user: str, collection: str):
        """
        Ensure a collection exists, creating it if necessary (lazy creation)

        Args:
            user: User ID
            collection: Collection ID
        """
        try:
            # Check if collection already exists
            existing = await self.table_store.get_collection(user, collection)
            if existing:
                logger.debug(f"Collection {user}/{collection} already exists")
                return

            # Create new collection with default metadata
            logger.info(f"Creating new collection {user}/{collection}")
            await self.table_store.create_collection(
                user=user,
                collection=collection,
                name=collection,  # Default name to collection ID
                description="",
                tags=set()
            )
        except Exception as e:
            logger.error(f"Error ensuring collection exists: {e}")
            # Don't fail the operation if collection creation fails.
            # This maintains backward compatibility.

    async def list_collections(self, request: CollectionManagementRequest) -> CollectionManagementResponse:
        """
        List collections for a user with optional tag filtering

        Args:
            request: Collection management request

        Returns:
            CollectionManagementResponse with list of collections

        Raises:
            RequestError: if the underlying table store query fails
        """
        try:
            tag_filter = list(request.tag_filter) if request.tag_filter else None
            collections = await self.table_store.list_collections(request.user, tag_filter)

            collection_metadata = [
                CollectionMetadata(
                    user=coll["user"],
                    collection=coll["collection"],
                    name=coll["name"],
                    description=coll["description"],
                    tags=coll["tags"],
                    created_at=coll["created_at"],
                    updated_at=coll["updated_at"]
                )
                for coll in collections
            ]

            return CollectionManagementResponse(
                error=None,
                collections=collection_metadata,
                timestamp=datetime.now().isoformat()
            )
        except Exception as e:
            logger.error(f"Error listing collections: {e}")
            raise RequestError(f"Failed to list collections: {str(e)}")

    async def update_collection(self, request: CollectionManagementRequest) -> CollectionManagementResponse:
        """
        Update collection metadata (creates if doesn't exist)

        Args:
            request: Collection management request

        Returns:
            CollectionManagementResponse with updated collection

        Raises:
            RequestError: if the create/update fails
        """
        try:
            # Check if collection exists, create if it doesn't
            existing = await self.table_store.get_collection(request.user, request.collection)

            if not existing:
                # Create new collection with provided metadata
                logger.info(f"Creating new collection {request.user}/{request.collection}")
                name = request.name if request.name else request.collection
                description = request.description if request.description else ""
                tags = set(request.tags) if request.tags else set()

                await self.table_store.create_collection(
                    user=request.user,
                    collection=request.collection,
                    name=name,
                    description=description,
                    tags=tags
                )

                # Get the newly created collection for response
                created_collection = await self.table_store.get_collection(request.user, request.collection)
                collection_metadata = CollectionMetadata(
                    user=created_collection["user"],
                    collection=created_collection["collection"],
                    name=created_collection["name"],
                    description=created_collection["description"],
                    tags=created_collection["tags"],
                    created_at=created_collection["created_at"],
                    updated_at=created_collection["updated_at"]
                )
            else:
                # Collection exists, update it
                name = request.name if request.name else None
                description = request.description if request.description else None
                tags = list(request.tags) if request.tags else None

                updated_collection = await self.table_store.update_collection(
                    request.user, request.collection, name, description, tags
                )
                collection_metadata = CollectionMetadata(
                    user=updated_collection["user"],
                    collection=updated_collection["collection"],
                    name=updated_collection["name"],
                    description=updated_collection["description"],
                    tags=updated_collection["tags"],
                    created_at="",  # Not returned by update
                    updated_at=updated_collection["updated_at"]
                )

            return CollectionManagementResponse(
                error=None,
                collections=[collection_metadata],
                timestamp=datetime.now().isoformat()
            )
        except Exception as e:
            logger.error(f"Error updating collection: {e}")
            raise RequestError(f"Failed to update collection: {str(e)}")

    async def delete_collection(self, request: CollectionManagementRequest) -> CollectionManagementResponse:
        """
        Delete collection with cascade to all storage types

        Args:
            request: Collection management request

        Returns:
            CollectionManagementResponse indicating success or failure

        Raises:
            RequestError: on unexpected failure during the cascade
        """
        try:
            deletion_key = (request.user, request.collection)
            logger.info(f"Starting cascade deletion for {request.user}/{request.collection}")

            # Only count storage services that are actually configured.
            # A hard-coded count of 3 would make the wait below time out
            # (and the deletion report failure) whenever fewer than three
            # producers are wired in.
            producers = [
                p for p in (
                    self.vector_storage_producer,
                    self.object_storage_producer,
                    self.triples_storage_producer,
                )
                if p is not None
            ]

            # Track this deletion request
            self.pending_deletions[deletion_key] = {
                "responses_pending": len(producers),
                "responses_received": [],
                "all_successful": True,
                "error_messages": [],
                "deletion_complete": asyncio.Event()
            }

            # Create storage management request
            storage_request = StorageManagementRequest(
                operation="delete-collection",
                user=request.user,
                collection=request.collection
            )

            # Send deletion requests to all configured storage types
            for producer in producers:
                await producer.send(storage_request)

            deletion_info = self.pending_deletions[deletion_key]

            # Nothing to wait for when no storage services are configured
            if deletion_info["responses_pending"] == 0:
                deletion_info["deletion_complete"].set()

            # Wait for all storage deletions to complete (with timeout)
            try:
                await asyncio.wait_for(
                    deletion_info["deletion_complete"].wait(),
                    timeout=30.0  # 30 second timeout
                )
            except asyncio.TimeoutError:
                logger.error(f"Timeout waiting for storage deletion responses for {deletion_key}")
                deletion_info["all_successful"] = False
                deletion_info["error_messages"].append("Timeout waiting for storage deletion")

            # Check if all deletions succeeded
            if not deletion_info["all_successful"]:
                error_msg = f"Storage deletion failed: {'; '.join(deletion_info['error_messages'])}"
                logger.error(error_msg)

                # Clean up tracking
                del self.pending_deletions[deletion_key]

                return CollectionManagementResponse(
                    error=Error(
                        type="storage_deletion_error",
                        message=error_msg
                    ),
                    timestamp=datetime.now().isoformat()
                )

            # All storage deletions succeeded, now delete metadata
            logger.info(f"Storage deletions complete, removing metadata for {deletion_key}")
            await self.table_store.delete_collection(request.user, request.collection)

            # Clean up tracking
            del self.pending_deletions[deletion_key]

            return CollectionManagementResponse(
                error=None,
                timestamp=datetime.now().isoformat()
            )
        except Exception as e:
            logger.error(f"Error deleting collection: {e}")
            # Clean up tracking on error
            if deletion_key in self.pending_deletions:
                del self.pending_deletions[deletion_key]
            raise RequestError(f"Failed to delete collection: {str(e)}")

    async def on_storage_response(self, response: StorageManagementResponse):
        """
        Handle storage management responses for deletion tracking

        Args:
            response: Storage management response
        """
        logger.debug(f"Received storage response: error={response.error}")

        # Find matching deletion by checking all pending deletions.
        # Note: This is simplified correlation - in production we'd want
        # a correlation id carried in the response.
        for deletion_key, info in list(self.pending_deletions.items()):
            if info["responses_pending"] > 0:
                # Record this response
                info["responses_received"].append(response)
                info["responses_pending"] -= 1

                # Check if this response indicates failure
                if response.error and response.error.message:
                    info["all_successful"] = False
                    info["error_messages"].append(response.error.message)
                    logger.warning(f"Storage deletion failed for {deletion_key}: {response.error.message}")
                else:
                    logger.debug(f"Storage deletion succeeded for {deletion_key}")

                # If all responses received, signal completion
                if info["responses_pending"] == 0:
                    logger.info(f"All storage responses received for {deletion_key}")
                    info["deletion_complete"].set()

                break  # Only process for first matching deletion

View file

@ -1,362 +0,0 @@
"""
Collection management service for the librarian
"""
import asyncio
import logging
from datetime import datetime
from .. base import AsyncProcessor, Consumer, Producer
from .. base import ConsumerMetrics, ProducerMetrics
from .. base.cassandra_config import add_cassandra_args, resolve_cassandra_config
from .. schema import CollectionManagementRequest, CollectionManagementResponse, Error
from .. schema import collection_request_queue, collection_response_queue
from .. schema import CollectionMetadata
from .. schema import StorageManagementRequest, StorageManagementResponse
from .. schema import vector_storage_management_topic, object_storage_management_topic, triples_storage_management_topic, storage_management_response_topic
from .. exceptions import RequestError
from .. tables.library import LibraryTableStore
# Module logger
logger = logging.getLogger(__name__)
# Default processor identity, used as the Pulsar subscriber name
default_ident = "collection-management"
# Default Cassandra contact point (service name in-cluster)
default_cassandra_host = "cassandra"
# Cassandra keyspace holding the librarian's collection metadata
keyspace = "librarian"
class Processor(AsyncProcessor):
    """Pulsar processor exposing collection management operations
    (list / update / cascade delete) backed by a Cassandra library store.

    Cascade deletion fans out StorageManagementRequests to the vector,
    object, and triples storage services and waits for their responses
    before removing the collection's metadata."""

    def __init__(self, **params):
        """Wire up Cassandra configuration, the collection request/response
        queues, the three storage-management producers, and the storage
        response consumer used to track cascade deletions."""
        id = params.get("id", default_ident)

        # Get Cassandra configuration
        cassandra_host = params.get("cassandra_host", default_cassandra_host)
        cassandra_username = params.get("cassandra_username")
        cassandra_password = params.get("cassandra_password")

        # Resolve configuration with environment variable fallback
        hosts, username, password = resolve_cassandra_config(
            host=cassandra_host,
            username=cassandra_username,
            password=cassandra_password
        )

        super(Processor, self).__init__(
            **params | {
                "cassandra_host": ','.join(hosts),
                "cassandra_username": username
            }
        )

        self.cassandra_host = hosts
        self.cassandra_username = username
        self.cassandra_password = password

        # Set up metrics
        collection_request_metrics = ConsumerMetrics(
            processor=self.id, flow=None, name="collection-request"
        )
        collection_response_metrics = ProducerMetrics(
            processor=self.id, flow=None, name="collection-response"
        )

        # Set up consumer for collection management requests
        self.collection_request_consumer = Consumer(
            taskgroup=self.taskgroup,
            client=self.pulsar_client,
            flow=None,
            topic=collection_request_queue,
            subscriber=id,
            schema=CollectionManagementRequest,
            handler=self.on_collection_request,
            metrics=collection_request_metrics,
        )

        # Set up producer for collection management responses
        self.collection_response_producer = Producer(
            client=self.pulsar_client,
            topic=collection_response_queue,
            schema=CollectionManagementResponse,
            metrics=collection_response_metrics,
        )

        # Set up producers for storage management requests
        self.vector_storage_producer = Producer(
            client=self.pulsar_client,
            topic=vector_storage_management_topic,
            schema=StorageManagementRequest,
        )
        self.object_storage_producer = Producer(
            client=self.pulsar_client,
            topic=object_storage_management_topic,
            schema=StorageManagementRequest,
        )
        self.triples_storage_producer = Producer(
            client=self.pulsar_client,
            topic=triples_storage_management_topic,
            schema=StorageManagementRequest,
        )

        # Set up consumer for storage management responses
        storage_response_metrics = ConsumerMetrics(
            processor=self.id, flow=None, name="storage-response"
        )
        self.storage_response_consumer = Consumer(
            taskgroup=self.taskgroup,
            client=self.pulsar_client,
            flow=None,
            topic=storage_management_response_topic,
            subscriber=f"{id}-storage",
            schema=StorageManagementResponse,
            handler=self.on_storage_response,
            metrics=storage_response_metrics,
        )

        # Initialize table store
        self.table_store = LibraryTableStore(
            cassandra_host=self.cassandra_host,
            cassandra_username=self.cassandra_username,
            cassandra_password=self.cassandra_password,
            keyspace=keyspace
        )

        # Track pending deletion requests, keyed by (user, collection) ->
        # {responses_pending, responses_received, all_successful,
        #  error_messages, deletion_complete}
        self.pending_deletions = {}

    async def on_collection_request(self, message):
        """Dispatch a collection management request and always send a
        response, converting any processing failure into an error response."""
        logger.debug(f"Collection request: {message.operation}")

        try:
            if message.operation == "list-collections":
                response = await self.handle_list_collections(message)
            elif message.operation == "update-collection":
                response = await self.handle_update_collection(message)
            elif message.operation == "delete-collection":
                response = await self.handle_delete_collection(message)
            else:
                response = CollectionManagementResponse(
                    success="false",
                    error=Error(
                        type="invalid_operation",
                        message=f"Unknown operation: {message.operation}"
                    ),
                    timestamp=datetime.now().isoformat()
                )
        except Exception as e:
            logger.error(f"Error processing collection request: {e}", exc_info=True)
            response = CollectionManagementResponse(
                success="false",
                error=Error(
                    type="processing_error",
                    message=str(e)
                ),
                timestamp=datetime.now().isoformat()
            )

        await self.collection_response_producer.send(response)

    async def on_storage_response(self, response):
        """Handle storage management responses for deletion tracking.

        Note: correlation is simplified - the response is credited to the
        first pending deletion still awaiting responses. In production we'd
        want a correlation id carried in the response. Each response must
        be counted exactly once, so there is a single matching pass here;
        running the correlation twice would double-decrement
        responses_pending and could mis-attribute the response to a second
        pending deletion.
        """
        logger.debug(f"Received storage response: error={response.error}")

        for deletion_key, info in list(self.pending_deletions.items()):
            if info["responses_pending"] > 0:
                # Record this response
                info["responses_received"].append(response)
                info["responses_pending"] -= 1

                # Check if this response indicates failure
                if response.error and response.error.message:
                    info["all_successful"] = False
                    info["error_messages"].append(response.error.message)
                    logger.warning(f"Storage deletion failed for {deletion_key}: {response.error.message}")
                else:
                    logger.debug(f"Storage deletion succeeded for {deletion_key}")

                # If all responses received, signal completion
                if info["responses_pending"] == 0:
                    logger.info(f"All storage responses received for {deletion_key}")
                    info["deletion_complete"].set()

                break  # Only process for first matching deletion

    async def handle_list_collections(self, message):
        """Handle list collections request; returns a success response
        carrying CollectionMetadata for each of the user's collections."""
        try:
            tag_filter = list(message.tag_filter) if message.tag_filter else None
            collections = await self.table_store.list_collections(message.user, tag_filter)

            collection_metadata = [
                CollectionMetadata(
                    user=coll["user"],
                    collection=coll["collection"],
                    name=coll["name"],
                    description=coll["description"],
                    tags=coll["tags"],
                    created_at=coll["created_at"],
                    updated_at=coll["updated_at"]
                )
                for coll in collections
            ]

            return CollectionManagementResponse(
                success="true",
                collections=collection_metadata,
                timestamp=datetime.now().isoformat()
            )
        except Exception as e:
            logger.error(f"Error listing collections: {e}")
            raise

    async def handle_update_collection(self, message):
        """Handle update collection request; only the fields present on
        the request are passed through for update."""
        try:
            # Extract fields for update
            name = message.name if message.name else None
            description = message.description if message.description else None
            tags = list(message.tags) if message.tags else None

            updated_collection = await self.table_store.update_collection(
                message.user, message.collection, name, description, tags
            )
            collection_metadata = CollectionMetadata(
                user=updated_collection["user"],
                collection=updated_collection["collection"],
                name=updated_collection["name"],
                description=updated_collection["description"],
                tags=updated_collection["tags"],
                created_at="",  # Not returned by update
                updated_at=updated_collection["updated_at"]
            )

            return CollectionManagementResponse(
                success="true",
                collections=[collection_metadata],
                timestamp=datetime.now().isoformat()
            )
        except Exception as e:
            logger.error(f"Error updating collection: {e}")
            raise

    async def handle_delete_collection(self, message):
        """Handle delete collection request with cascade to all storage
        types: fan out delete requests, wait (bounded) for all responses,
        and only then remove the collection metadata."""
        try:
            deletion_key = (message.user, message.collection)
            logger.info(f"Starting cascade deletion for {message.user}/{message.collection}")

            # Track this deletion request
            self.pending_deletions[deletion_key] = {
                "responses_pending": 3,  # vector, object, triples
                "responses_received": [],
                "all_successful": True,
                "error_messages": [],
                "deletion_complete": asyncio.Event()
            }

            # Create storage management request
            storage_request = StorageManagementRequest(
                operation="delete-collection",
                user=message.user,
                collection=message.collection
            )

            # Send delete requests to all three storage types
            await self.vector_storage_producer.send(storage_request)
            await self.object_storage_producer.send(storage_request)
            await self.triples_storage_producer.send(storage_request)

            logger.info(f"Storage deletion requests sent for {message.user}/{message.collection}")

            # Wait for all storage responses (with timeout)
            try:
                await asyncio.wait_for(
                    self.pending_deletions[deletion_key]["deletion_complete"].wait(),
                    timeout=30.0  # 30 second timeout
                )
            except asyncio.TimeoutError:
                logger.error(f"Timeout waiting for storage responses for {deletion_key}")
                self.pending_deletions[deletion_key]["all_successful"] = False
                self.pending_deletions[deletion_key]["error_messages"].append("Timeout waiting for storage responses")

            # Check if all storage deletions were successful
            deletion_info = self.pending_deletions.pop(deletion_key, {})

            if deletion_info.get("all_successful", False):
                # All storage deletions succeeded, now delete metadata
                await self.table_store.delete_collection_metadata(message.user, message.collection)
                logger.info(f"Successfully completed cascade deletion for {message.user}/{message.collection}")

                return CollectionManagementResponse(
                    success="true",
                    timestamp=datetime.now().isoformat()
                )
            else:
                # Some storage deletions failed
                error_messages = deletion_info.get("error_messages", ["Unknown storage deletion error"])
                error_msg = "; ".join(error_messages)
                logger.error(f"Cascade deletion failed for {deletion_key}: {error_msg}")

                return CollectionManagementResponse(
                    success="false",
                    error=Error(
                        type="storage_deletion_error",
                        message=f"Storage deletion failed: {error_msg}"
                    ),
                    timestamp=datetime.now().isoformat()
                )
        except Exception as e:
            logger.error(f"Error in cascade deletion: {e}")
            return CollectionManagementResponse(
                success="false",
                error=Error(
                    type="deletion_error",
                    message=f"Failed to delete collection: {str(e)}"
                ),
                timestamp=datetime.now().isoformat()
            )

    @staticmethod
    def add_args(parser):
        """Register base processor and Cassandra command-line arguments."""
        AsyncProcessor.add_args(parser)
        add_cassandra_args(parser)
def run():
    """Entry point: launch the collection-management processor."""
    Processor.launch(default_ident, __doc__)

View file

@ -8,6 +8,7 @@ import asyncio
import base64
import json
import logging
from datetime import datetime
from .. base import AsyncProcessor, Consumer, Producer, Publisher, Subscriber
from .. base import ConsumerMetrics, ProducerMetrics
@ -15,6 +16,11 @@ from .. base.cassandra_config import add_cassandra_args, resolve_cassandra_confi
from .. schema import LibrarianRequest, LibrarianResponse, Error
from .. schema import librarian_request_queue, librarian_response_queue
from .. schema import CollectionManagementRequest, CollectionManagementResponse
from .. schema import collection_request_queue, collection_response_queue
from .. schema import StorageManagementRequest, StorageManagementResponse
from .. schema import vector_storage_management_topic, object_storage_management_topic
from .. schema import triples_storage_management_topic, storage_management_response_topic
from .. schema import Document, Metadata
from .. schema import TextDocument, Metadata
@ -22,6 +28,7 @@ from .. schema import TextDocument, Metadata
from .. exceptions import RequestError
from . librarian import Librarian
from . collection_manager import CollectionManager
# Module logger
logger = logging.getLogger(__name__)
@ -30,6 +37,8 @@ default_ident = "librarian"
default_librarian_request_queue = librarian_request_queue
default_librarian_response_queue = librarian_response_queue
default_collection_request_queue = collection_request_queue
default_collection_response_queue = collection_response_queue
default_minio_host = "minio:9000"
default_minio_access_key = "minioadmin"
@ -57,6 +66,14 @@ class Processor(AsyncProcessor):
"librarian_response_queue", default_librarian_response_queue
)
collection_request_queue = params.get(
"collection_request_queue", default_collection_request_queue
)
collection_response_queue = params.get(
"collection_response_queue", default_collection_response_queue
)
minio_host = params.get("minio_host", default_minio_host)
minio_access_key = params.get(
"minio_access_key",
@ -87,6 +104,8 @@ class Processor(AsyncProcessor):
**params | {
"librarian_request_queue": librarian_request_queue,
"librarian_response_queue": librarian_response_queue,
"collection_request_queue": collection_request_queue,
"collection_response_queue": collection_response_queue,
"minio_host": minio_host,
"minio_access_key": minio_access_key,
"cassandra_host": self.cassandra_host,
@ -103,6 +122,18 @@ class Processor(AsyncProcessor):
processor = self.id, flow = None, name = "librarian-response"
)
collection_request_metrics = ConsumerMetrics(
processor = self.id, flow = None, name = "collection-request"
)
collection_response_metrics = ProducerMetrics(
processor = self.id, flow = None, name = "collection-response"
)
storage_response_metrics = ConsumerMetrics(
processor = self.id, flow = None, name = "storage-response"
)
self.librarian_request_consumer = Consumer(
taskgroup = self.taskgroup,
client = self.pulsar_client,
@ -121,6 +152,54 @@ class Processor(AsyncProcessor):
metrics = librarian_response_metrics,
)
self.collection_request_consumer = Consumer(
taskgroup = self.taskgroup,
client = self.pulsar_client,
flow = None,
topic = collection_request_queue,
subscriber = id,
schema = CollectionManagementRequest,
handler = self.on_collection_request,
metrics = collection_request_metrics,
)
self.collection_response_producer = Producer(
client = self.pulsar_client,
topic = collection_response_queue,
schema = CollectionManagementResponse,
metrics = collection_response_metrics,
)
# Storage management producers for collection deletion
self.vector_storage_producer = Producer(
client = self.pulsar_client,
topic = vector_storage_management_topic,
schema = StorageManagementRequest,
)
self.object_storage_producer = Producer(
client = self.pulsar_client,
topic = object_storage_management_topic,
schema = StorageManagementRequest,
)
self.triples_storage_producer = Producer(
client = self.pulsar_client,
topic = triples_storage_management_topic,
schema = StorageManagementRequest,
)
self.storage_response_consumer = Consumer(
taskgroup = self.taskgroup,
client = self.pulsar_client,
flow = None,
topic = storage_management_response_topic,
subscriber = id,
schema = StorageManagementResponse,
handler = self.on_storage_response,
metrics = storage_response_metrics,
)
self.librarian = Librarian(
cassandra_host = self.cassandra_host,
cassandra_username = self.cassandra_username,
@ -133,6 +212,17 @@ class Processor(AsyncProcessor):
load_document = self.load_document,
)
self.collection_manager = CollectionManager(
cassandra_host = self.cassandra_host,
cassandra_username = self.cassandra_username,
cassandra_password = self.cassandra_password,
keyspace = keyspace,
vector_storage_producer = self.vector_storage_producer,
object_storage_producer = self.object_storage_producer,
triples_storage_producer = self.triples_storage_producer,
storage_response_consumer = self.storage_response_consumer,
)
self.register_config_handler(self.on_librarian_config)
self.flows = {}
@ -144,6 +234,12 @@ class Processor(AsyncProcessor):
await super(Processor, self).start()
await self.librarian_request_consumer.start()
await self.librarian_response_producer.start()
await self.collection_request_consumer.start()
await self.collection_response_producer.start()
await self.vector_storage_producer.start()
await self.object_storage_producer.start()
await self.triples_storage_producer.start()
await self.storage_response_consumer.start()
async def on_librarian_config(self, config, version):
@ -223,6 +319,19 @@ class Processor(AsyncProcessor):
logger.debug("Document submitted")
async def add_processing_with_collection(self, request):
    """Delegate to the librarian's add_processing, lazily creating the
    target collection first when the request carries processing metadata.
    """
    metadata = getattr(request, 'processing_metadata', None)
    if metadata:
        # Lazy collection creation keyed on the processing metadata
        await self.collection_manager.ensure_collection_exists(
            metadata.user, metadata.collection
        )
    # Hand off to the original add_processing implementation
    return await self.librarian.add_processing(request)
async def process_request(self, v):
if v.operation is None:
@ -236,7 +345,7 @@ class Processor(AsyncProcessor):
"update-document": self.librarian.update_document,
"get-document-metadata": self.librarian.get_document_metadata,
"get-document-content": self.librarian.get_document_content,
"add-processing": self.librarian.add_processing,
"add-processing": self.add_processing_with_collection,
"remove-processing": self.librarian.remove_processing,
"list-documents": self.librarian.list_documents,
"list-processing": self.librarian.list_processing,
@ -296,6 +405,73 @@ class Processor(AsyncProcessor):
logger.debug("Librarian input processing complete")
async def process_collection_request(self, v):
    """Dispatch a collection management request to the matching
    CollectionManager operation; raises RequestError for a null or
    unrecognised operation."""
    op = v.operation
    if op is None:
        raise RequestError("Null operation")

    logger.debug(f"Collection request: {op}")

    mgr = self.collection_manager
    handler = {
        "list-collections": mgr.list_collections,
        "update-collection": mgr.update_collection,
        "delete-collection": mgr.delete_collection,
    }.get(op)

    if handler is None:
        raise RequestError(f"Invalid collection operation: {op}")

    return await handler(v)
async def on_collection_request(self, msg, consumer, flow):
    """Consume one collection management request message, dispatch it,
    and publish either the result or an error response, echoing the
    request id so the caller can correlate."""
    v = msg.value()
    id = msg.properties().get("id", "unknown")
    logger.info(f"Handling collection request {id}...")

    async def reply(resp):
        # All outcomes are published with the originating request id
        await self.collection_response_producer.send(
            resp, properties={"id": id}
        )

    try:
        await reply(await self.process_collection_request(v))
    except RequestError as e:
        await reply(CollectionManagementResponse(
            error=Error(
                type="request-error",
                message=str(e),
            ),
            timestamp=datetime.now().isoformat()
        ))
    except Exception as e:
        await reply(CollectionManagementResponse(
            error=Error(
                type="unexpected-error",
                message=str(e),
            ),
            timestamp=datetime.now().isoformat()
        ))

    logger.debug("Collection request processing complete")
async def on_storage_response(self, msg, consumer, flow):
    """Forward a storage management response message to the collection
    manager, which tracks pending cascade deletions."""
    payload = msg.value()
    logger.debug("Received storage management response")
    await self.collection_manager.on_storage_response(payload)
@staticmethod
def add_args(parser):
@ -313,6 +489,18 @@ class Processor(AsyncProcessor):
help=f'Config response queue {default_librarian_response_queue}',
)
parser.add_argument(
'--collection-request-queue',
default=default_collection_request_queue,
help=f'Collection request queue (default: {default_collection_request_queue})'
)
parser.add_argument(
'--collection-response-queue',
default=default_collection_response_queue,
help=f'Collection response queue (default: {default_collection_response_queue})'
)
parser.add_argument(
'--minio-host',
default=default_minio_host,

View file

@ -67,8 +67,7 @@ class Processor(DocumentEmbeddingsQueryService):
dim = len(vec)
collection = (
"d_" + msg.user + "_" + msg.collection + "_" +
str(dim)
"d_" + msg.user + "_" + msg.collection
)
self.ensure_collection_exists(collection, dim)

View file

@ -74,8 +74,7 @@ class Processor(GraphEmbeddingsQueryService):
dim = len(vec)
collection = (
"t_" + msg.user + "_" + msg.collection + "_" +
str(dim)
"t_" + msg.user + "_" + msg.collection
)
self.ensure_collection_exists(collection, dim)

View file

@ -654,7 +654,7 @@ class LibraryTableStore:
logger.error(f"Error updating collection: {e}")
raise
async def delete_collection_metadata(self, user, collection):
async def delete_collection(self, user, collection):
"""Delete collection metadata record"""
try:
await asyncio.get_event_loop().run_in_executor(
@ -683,3 +683,35 @@ class LibraryTableStore:
except Exception as e:
logger.error(f"Error getting collection: {e}")
raise
async def create_collection(self, user, collection, name=None, description=None, tags=None):
    """Create a new collection metadata record.

    Args:
        user: User ID owning the collection
        collection: Collection ID
        name: Human-readable name (defaults to the collection ID)
        description: Free-text description (defaults to "")
        tags: Set of tags (defaults to an empty set)

    Returns:
        dict describing the created collection, with ISO-format
        created_at/updated_at timestamps and tags normalised to a list.

    Raises:
        Re-raises any exception from the underlying Cassandra execution.
    """
    try:
        import datetime
        now = datetime.datetime.now()

        # Set defaults for optional parameters
        name = name if name is not None else collection
        description = description if description is not None else ""
        tags = tags if tags is not None else set()

        # Run the blocking Cassandra call off the event loop thread.
        # get_running_loop() replaces get_event_loop(), which is
        # deprecated inside a coroutine since Python 3.10.
        await asyncio.get_running_loop().run_in_executor(
            None, self.cassandra.execute, self.insert_collection_stmt,
            [user, collection, name, description, tags, now, now]
        )

        logger.info(f"Created collection {user}/{collection}")

        # Return the created collection data
        return {
            "user": user,
            "collection": collection,
            "name": name,
            "description": description,
            "tags": list(tags) if isinstance(tags, set) else tags,
            "created_at": now.isoformat(),
            "updated_at": now.isoformat()
        }
    except Exception as e:
        logger.error(f"Error creating collection: {e}")
        raise