mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-29 02:23:44 +02:00
Address legacy issues in storage management (#595)
* Removed legacy storage management cruft. Tidied tech specs.
* Fix deletion of last collection
* Storage processor ignores data on the queue which is for a deleted collection
* Updated tests
This commit is contained in:
parent
25563bae3c
commit
ae13190093
12 changed files with 188 additions and 264 deletions
|
|
@ -90,6 +90,15 @@ class Processor(CollectionConfigHandler, DocumentEmbeddingsStoreService):
|
|||
|
||||
async def store_document_embeddings(self, message):
|
||||
|
||||
# Validate collection exists in config before processing
|
||||
if not self.collection_exists(message.metadata.user, message.metadata.collection):
|
||||
logger.warning(
|
||||
f"Collection {message.metadata.collection} for user {message.metadata.user} "
|
||||
f"does not exist in config (likely deleted while data was in-flight). "
|
||||
f"Dropping message."
|
||||
)
|
||||
return
|
||||
|
||||
for emb in message.chunks:
|
||||
|
||||
if emb.chunk is None or emb.chunk == b"": continue
|
||||
|
|
@ -105,7 +114,7 @@ class Processor(CollectionConfigHandler, DocumentEmbeddingsStoreService):
|
|||
f"d-{message.metadata.user}-{message.metadata.collection}-{dim}"
|
||||
)
|
||||
|
||||
# Lazily create index if it doesn't exist
|
||||
# Lazily create index if it doesn't exist (but only if authorized in config)
|
||||
if not self.pinecone.has_index(index_name):
|
||||
logger.info(f"Lazily creating Pinecone index {index_name} with dimension {dim}")
|
||||
self.create_index(index_name, dim)
|
||||
|
|
|
|||
|
|
@ -41,6 +41,15 @@ class Processor(CollectionConfigHandler, DocumentEmbeddingsStoreService):
|
|||
|
||||
async def store_document_embeddings(self, message):
|
||||
|
||||
# Validate collection exists in config before processing
|
||||
if not self.collection_exists(message.metadata.user, message.metadata.collection):
|
||||
logger.warning(
|
||||
f"Collection {message.metadata.collection} for user {message.metadata.user} "
|
||||
f"does not exist in config (likely deleted while data was in-flight). "
|
||||
f"Dropping message."
|
||||
)
|
||||
return
|
||||
|
||||
for emb in message.chunks:
|
||||
|
||||
chunk = emb.chunk.decode("utf-8")
|
||||
|
|
@ -54,7 +63,7 @@ class Processor(CollectionConfigHandler, DocumentEmbeddingsStoreService):
|
|||
f"d_{message.metadata.user}_{message.metadata.collection}_{dim}"
|
||||
)
|
||||
|
||||
# Lazily create collection if it doesn't exist
|
||||
# Lazily create collection if it doesn't exist (but only if authorized in config)
|
||||
if not self.qdrant.collection_exists(collection):
|
||||
logger.info(f"Lazily creating Qdrant collection {collection} with dimension {dim}")
|
||||
self.qdrant.create_collection(
|
||||
|
|
|
|||
|
|
@ -90,6 +90,15 @@ class Processor(CollectionConfigHandler, GraphEmbeddingsStoreService):
|
|||
|
||||
async def store_graph_embeddings(self, message):
|
||||
|
||||
# Validate collection exists in config before processing
|
||||
if not self.collection_exists(message.metadata.user, message.metadata.collection):
|
||||
logger.warning(
|
||||
f"Collection {message.metadata.collection} for user {message.metadata.user} "
|
||||
f"does not exist in config (likely deleted while data was in-flight). "
|
||||
f"Dropping message."
|
||||
)
|
||||
return
|
||||
|
||||
for entity in message.entities:
|
||||
|
||||
if entity.entity.value == "" or entity.entity.value is None:
|
||||
|
|
@ -103,7 +112,7 @@ class Processor(CollectionConfigHandler, GraphEmbeddingsStoreService):
|
|||
f"t-{message.metadata.user}-{message.metadata.collection}-{dim}"
|
||||
)
|
||||
|
||||
# Lazily create index if it doesn't exist
|
||||
# Lazily create index if it doesn't exist (but only if authorized in config)
|
||||
if not self.pinecone.has_index(index_name):
|
||||
logger.info(f"Lazily creating Pinecone index {index_name} with dimension {dim}")
|
||||
self.create_index(index_name, dim)
|
||||
|
|
|
|||
|
|
@ -41,6 +41,15 @@ class Processor(CollectionConfigHandler, GraphEmbeddingsStoreService):
|
|||
|
||||
async def store_graph_embeddings(self, message):
|
||||
|
||||
# Validate collection exists in config before processing
|
||||
if not self.collection_exists(message.metadata.user, message.metadata.collection):
|
||||
logger.warning(
|
||||
f"Collection {message.metadata.collection} for user {message.metadata.user} "
|
||||
f"does not exist in config (likely deleted while data was in-flight). "
|
||||
f"Dropping message."
|
||||
)
|
||||
return
|
||||
|
||||
for entity in message.entities:
|
||||
|
||||
if entity.entity.value == "" or entity.entity.value is None: return
|
||||
|
|
@ -53,7 +62,7 @@ class Processor(CollectionConfigHandler, GraphEmbeddingsStoreService):
|
|||
f"t_{message.metadata.user}_{message.metadata.collection}_{dim}"
|
||||
)
|
||||
|
||||
# Lazily create collection if it doesn't exist
|
||||
# Lazily create collection if it doesn't exist (but only if authorized in config)
|
||||
if not self.qdrant.collection_exists(collection):
|
||||
logger.info(f"Lazily creating Qdrant collection {collection} with dimension {dim}")
|
||||
self.qdrant.create_collection(
|
||||
|
|
|
|||
|
|
@ -13,9 +13,8 @@ from cassandra import ConsistencyLevel
|
|||
|
||||
from .... schema import ExtractedObject
|
||||
from .... schema import RowSchema, Field
|
||||
from .... schema import StorageManagementRequest, StorageManagementResponse
|
||||
from .... schema import object_storage_management_topic, storage_management_response_topic
|
||||
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||
from .... base import CollectionConfigHandler
|
||||
from .... base.cassandra_config import add_cassandra_args, resolve_cassandra_config
|
||||
|
||||
# Module logger
|
||||
|
|
@ -23,7 +22,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
default_ident = "objects-write"
|
||||
|
||||
class Processor(FlowProcessor):
|
||||
class Processor(CollectionConfigHandler, FlowProcessor):
|
||||
|
||||
def __init__(self, **params):
|
||||
|
||||
|
|
@ -64,39 +63,9 @@ class Processor(FlowProcessor):
|
|||
)
|
||||
)
|
||||
|
||||
# Set up storage management consumer and producer directly
|
||||
# (FlowProcessor doesn't support topic-based specs outside of flows)
|
||||
from .... base import Consumer, Producer, ConsumerMetrics, ProducerMetrics
|
||||
|
||||
storage_request_metrics = ConsumerMetrics(
|
||||
processor=self.id, flow=None, name="storage-request"
|
||||
)
|
||||
storage_response_metrics = ProducerMetrics(
|
||||
processor=self.id, flow=None, name="storage-response"
|
||||
)
|
||||
|
||||
# Create storage management consumer
|
||||
self.storage_request_consumer = Consumer(
|
||||
taskgroup=self.taskgroup,
|
||||
backend=self.pubsub,
|
||||
flow=None,
|
||||
topic=object_storage_management_topic,
|
||||
subscriber=f"{id}-storage",
|
||||
schema=StorageManagementRequest,
|
||||
handler=self.on_storage_management,
|
||||
metrics=storage_request_metrics,
|
||||
)
|
||||
|
||||
# Create storage management response producer
|
||||
self.storage_response_producer = Producer(
|
||||
backend=self.pubsub,
|
||||
topic=storage_management_response_topic,
|
||||
schema=StorageManagementResponse,
|
||||
metrics=storage_response_metrics,
|
||||
)
|
||||
|
||||
# Register config handler for schema updates
|
||||
# Register config handlers
|
||||
self.register_config_handler(self.on_schema_config)
|
||||
self.register_config_handler(self.on_collection_config)
|
||||
|
||||
# Cache of known keyspaces/tables
|
||||
self.known_keyspaces: Set[str] = set()
|
||||
|
|
@ -347,28 +316,14 @@ class Processor(FlowProcessor):
|
|||
obj = msg.value()
|
||||
logger.info(f"Storing {len(obj.values)} objects for schema {obj.schema_name} from {obj.metadata.id}")
|
||||
|
||||
# Validate collection/keyspace exists before accepting writes
|
||||
safe_keyspace = self.sanitize_name(obj.metadata.user)
|
||||
if safe_keyspace not in self.known_keyspaces:
|
||||
# Check if keyspace actually exists in Cassandra
|
||||
self.connect_cassandra()
|
||||
check_keyspace_cql = """
|
||||
SELECT keyspace_name FROM system_schema.keyspaces
|
||||
WHERE keyspace_name = %s
|
||||
"""
|
||||
result = self.session.execute(check_keyspace_cql, (safe_keyspace,))
|
||||
# Check if result is None (mock case) or has no rows
|
||||
if result is None or not result.one():
|
||||
error_msg = (
|
||||
f"Collection {obj.metadata.collection} does not exist. "
|
||||
f"Create it first via collection management API."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
# Cache it if it exists
|
||||
self.known_keyspaces.add(safe_keyspace)
|
||||
if safe_keyspace not in self.known_tables:
|
||||
self.known_tables[safe_keyspace] = set()
|
||||
# Validate collection exists before accepting writes
|
||||
if not self.collection_exists(obj.metadata.user, obj.metadata.collection):
|
||||
error_msg = (
|
||||
f"Collection {obj.metadata.collection} does not exist. "
|
||||
f"Create it first via collection management API."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
# Get schema definition
|
||||
schema = self.schemas.get(obj.schema_name)
|
||||
|
|
@ -447,55 +402,7 @@ class Processor(FlowProcessor):
|
|||
logger.error(f"Failed to insert object {obj_index}: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
async def on_storage_management(self, msg, consumer, flow):
|
||||
"""Handle storage management requests for collection operations"""
|
||||
request = msg.value()
|
||||
logger.info(f"Received storage management request: {request.operation} for {request.user}/{request.collection}")
|
||||
|
||||
try:
|
||||
if request.operation == "create-collection":
|
||||
await self.create_collection(request.user, request.collection)
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(
|
||||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully created collection {request.user}/{request.collection}")
|
||||
elif request.operation == "delete-collection":
|
||||
await self.delete_collection(request.user, request.collection)
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(
|
||||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully deleted collection {request.user}/{request.collection}")
|
||||
else:
|
||||
logger.warning(f"Unknown storage management operation: {request.operation}")
|
||||
# Send error response
|
||||
from .... schema import Error
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="unknown_operation",
|
||||
message=f"Unknown operation: {request.operation}"
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error handling storage management request: {e}", exc_info=True)
|
||||
# Send error response
|
||||
from .... schema import Error
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="processing_error",
|
||||
message=str(e)
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def create_collection(self, user: str, collection: str):
|
||||
async def create_collection(self, user: str, collection: str, metadata: dict):
|
||||
"""Create/verify collection exists in Cassandra object store"""
|
||||
# Connect if not already connected
|
||||
self.connect_cassandra()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue