Collection management (#520)

* Tech spec

* Refactored Cassanda knowledge graph for single table

* Collection management, librarian services to manage metadata and collection deletion
This commit is contained in:
cybermaggedon 2025-09-18 15:57:52 +01:00 committed by GitHub
parent 48016d8fb2
commit 13ff7d765d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
48 changed files with 2941 additions and 425 deletions

View file

@ -12,6 +12,10 @@ import os
import logging
from .... base import DocumentEmbeddingsStoreService
from .... base import AsyncProcessor, Consumer, Producer
from .... base import ConsumerMetrics, ProducerMetrics
from .... schema import StorageManagementRequest, StorageManagementResponse, Error
from .... schema import vector_storage_management_topic, storage_management_response_topic
# Module logger
logger = logging.getLogger(__name__)
@ -55,6 +59,34 @@ class Processor(DocumentEmbeddingsStoreService):
self.last_index_name = None
# Set up metrics for storage management
storage_request_metrics = ConsumerMetrics(
processor=self.id, flow=None, name="storage-request"
)
storage_response_metrics = ProducerMetrics(
processor=self.id, flow=None, name="storage-response"
)
# Set up consumer for storage management requests
self.storage_request_consumer = Consumer(
taskgroup=self.taskgroup,
client=self.pulsar_client,
flow=None,
topic=vector_storage_management_topic,
subscriber=f"{self.id}-storage",
schema=StorageManagementRequest,
handler=self.on_storage_management,
metrics=storage_request_metrics,
)
# Set up producer for storage management responses
self.storage_response_producer = Producer(
client=self.pulsar_client,
topic=storage_management_response_topic,
schema=StorageManagementResponse,
metrics=storage_response_metrics,
)
def create_index(self, index_name, dim):
self.pinecone.create_index(
@ -96,7 +128,7 @@ class Processor(DocumentEmbeddingsStoreService):
dim = len(vec)
index_name = (
"d-" + message.metadata.user + "-" + message.metadata.collection + "-" + str(dim)
"d-" + message.metadata.user + "-" + message.metadata.collection
)
if index_name != self.last_index_name:
@ -160,6 +192,54 @@ class Processor(DocumentEmbeddingsStoreService):
help=f'Pinecone region, (default: {default_region}'
)
async def on_storage_management(self, message):
"""Handle storage management requests"""
logger.info(f"Storage management request: {message.operation} for {message.user}/{message.collection}")
try:
if message.operation == "delete-collection":
await self.handle_delete_collection(message)
else:
response = StorageManagementResponse(
error=Error(
type="invalid_operation",
message=f"Unknown operation: {message.operation}"
)
)
await self.storage_response_producer.send(response)
except Exception as e:
logger.error(f"Error processing storage management request: {e}", exc_info=True)
response = StorageManagementResponse(
error=Error(
type="processing_error",
message=str(e)
)
)
await self.storage_response_producer.send(response)
async def handle_delete_collection(self, message):
"""Delete the collection for document embeddings"""
try:
index_name = f"d-{message.user}-{message.collection}"
if self.pinecone.has_index(index_name):
self.pinecone.delete_index(index_name)
logger.info(f"Deleted Pinecone index: {index_name}")
else:
logger.info(f"Index {index_name} does not exist, nothing to delete")
# Send success response
response = StorageManagementResponse(
error=None # No error means success
)
await self.storage_response_producer.send(response)
logger.info(f"Successfully deleted collection {message.user}/{message.collection}")
except Exception as e:
logger.error(f"Failed to delete collection: {e}")
raise
def run():
Processor.launch(default_ident, __doc__)