mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-29 02:23:44 +02:00
Collection delete pt. 3 (#542)
* Fixing collection deletion * Fixing collection management param error * Always test for collections * Add Cassandra collection table * Updated tech spec for explicit creation/deletion * Remove implicit collection creation * Fix up collection tracking in all processors
This commit is contained in:
parent
dc79b10552
commit
52b133fc86
31 changed files with 1761 additions and 843 deletions
|
|
@ -24,16 +24,12 @@ class KnowledgeGraph:
|
|||
self.keyspace = keyspace
|
||||
self.username = username
|
||||
|
||||
# Multi-table schema design for optimal performance
|
||||
self.use_legacy = os.getenv('CASSANDRA_USE_LEGACY', 'false').lower() == 'true'
|
||||
|
||||
if self.use_legacy:
|
||||
self.table = "triples" # Legacy single table
|
||||
else:
|
||||
# New optimized tables
|
||||
self.subject_table = "triples_s"
|
||||
self.po_table = "triples_p"
|
||||
self.object_table = "triples_o"
|
||||
# Optimized multi-table schema with collection deletion support
|
||||
self.subject_table = "triples_s"
|
||||
self.po_table = "triples_p"
|
||||
self.object_table = "triples_o"
|
||||
self.collection_table = "triples_collection" # For SPO queries and deletion
|
||||
self.collection_metadata_table = "collection_metadata" # For tracking which collections exist
|
||||
|
||||
if username and password:
|
||||
ssl_context = SSLContext(PROTOCOL_TLSv1_2)
|
||||
|
|
@ -47,9 +43,7 @@ class KnowledgeGraph:
|
|||
_active_clusters.append(self.cluster)
|
||||
|
||||
self.init()
|
||||
|
||||
if not self.use_legacy:
|
||||
self.prepare_statements()
|
||||
self.prepare_statements()
|
||||
|
||||
def clear(self):
|
||||
|
||||
|
|
@ -70,42 +64,13 @@ class KnowledgeGraph:
|
|||
""");
|
||||
|
||||
self.session.set_keyspace(self.keyspace)
|
||||
self.init_optimized_schema()
|
||||
|
||||
if self.use_legacy:
|
||||
self.init_legacy_schema()
|
||||
else:
|
||||
self.init_optimized_schema()
|
||||
|
||||
def init_legacy_schema(self):
|
||||
"""Initialize legacy single-table schema for backward compatibility"""
|
||||
self.session.execute(f"""
|
||||
create table if not exists {self.table} (
|
||||
collection text,
|
||||
s text,
|
||||
p text,
|
||||
o text,
|
||||
PRIMARY KEY (collection, s, p, o)
|
||||
);
|
||||
""");
|
||||
|
||||
self.session.execute(f"""
|
||||
create index if not exists {self.table}_s
|
||||
ON {self.table} (s);
|
||||
""");
|
||||
|
||||
self.session.execute(f"""
|
||||
create index if not exists {self.table}_p
|
||||
ON {self.table} (p);
|
||||
""");
|
||||
|
||||
self.session.execute(f"""
|
||||
create index if not exists {self.table}_o
|
||||
ON {self.table} (o);
|
||||
""");
|
||||
|
||||
def init_optimized_schema(self):
|
||||
"""Initialize optimized multi-table schema for performance"""
|
||||
# Table 1: Subject-centric queries (get_s, get_sp, get_spo, get_os)
|
||||
# Table 1: Subject-centric queries (get_s, get_sp, get_os)
|
||||
# Compound partition key for optimal data distribution
|
||||
self.session.execute(f"""
|
||||
CREATE TABLE IF NOT EXISTS {self.subject_table} (
|
||||
collection text,
|
||||
|
|
@ -117,6 +82,7 @@ class KnowledgeGraph:
|
|||
""");
|
||||
|
||||
# Table 2: Predicate-Object queries (get_p, get_po) - eliminates ALLOW FILTERING!
|
||||
# Compound partition key for optimal data distribution
|
||||
self.session.execute(f"""
|
||||
CREATE TABLE IF NOT EXISTS {self.po_table} (
|
||||
collection text,
|
||||
|
|
@ -128,6 +94,7 @@ class KnowledgeGraph:
|
|||
""");
|
||||
|
||||
# Table 3: Object-centric queries (get_o)
|
||||
# Compound partition key for optimal data distribution
|
||||
self.session.execute(f"""
|
||||
CREATE TABLE IF NOT EXISTS {self.object_table} (
|
||||
collection text,
|
||||
|
|
@ -138,7 +105,29 @@ class KnowledgeGraph:
|
|||
);
|
||||
""");
|
||||
|
||||
logger.info("Optimized multi-table schema initialized")
|
||||
# Table 4: Collection management and SPO queries (get_spo)
|
||||
# Simple partition key enables efficient collection deletion
|
||||
self.session.execute(f"""
|
||||
CREATE TABLE IF NOT EXISTS {self.collection_table} (
|
||||
collection text,
|
||||
s text,
|
||||
p text,
|
||||
o text,
|
||||
PRIMARY KEY (collection, s, p, o)
|
||||
);
|
||||
""");
|
||||
|
||||
# Table 5: Collection metadata tracking
|
||||
# Tracks which collections exist without polluting triple data
|
||||
self.session.execute(f"""
|
||||
CREATE TABLE IF NOT EXISTS {self.collection_metadata_table} (
|
||||
collection text,
|
||||
created_at timestamp,
|
||||
PRIMARY KEY (collection)
|
||||
);
|
||||
""");
|
||||
|
||||
logger.info("Optimized multi-table schema initialized (5 tables)")
|
||||
|
||||
def prepare_statements(self):
|
||||
"""Prepare statements for optimal performance"""
|
||||
|
|
@ -155,6 +144,10 @@ class KnowledgeGraph:
|
|||
f"INSERT INTO {self.object_table} (collection, o, s, p) VALUES (?, ?, ?, ?)"
|
||||
)
|
||||
|
||||
self.insert_collection_stmt = self.session.prepare(
|
||||
f"INSERT INTO {self.collection_table} (collection, s, p, o) VALUES (?, ?, ?, ?)"
|
||||
)
|
||||
|
||||
# Query statements for optimized access
|
||||
self.get_all_stmt = self.session.prepare(
|
||||
f"SELECT s, p, o FROM {self.subject_table} WHERE collection = ? LIMIT ? ALLOW FILTERING"
|
||||
|
|
@ -186,158 +179,168 @@ class KnowledgeGraph:
|
|||
)
|
||||
|
||||
self.get_spo_stmt = self.session.prepare(
|
||||
f"SELECT s as x FROM {self.subject_table} WHERE collection = ? AND s = ? AND p = ? AND o = ? LIMIT ?"
|
||||
f"SELECT s as x FROM {self.collection_table} WHERE collection = ? AND s = ? AND p = ? AND o = ? LIMIT ?"
|
||||
)
|
||||
|
||||
logger.info("Prepared statements initialized for optimal performance")
|
||||
logger.info("Prepared statements initialized for optimal performance (4 tables)")
|
||||
|
||||
def insert(self, collection, s, p, o):
|
||||
# Batch write to all four tables for consistency
|
||||
batch = BatchStatement()
|
||||
|
||||
if self.use_legacy:
|
||||
self.session.execute(
|
||||
f"insert into {self.table} (collection, s, p, o) values (%s, %s, %s, %s)",
|
||||
(collection, s, p, o)
|
||||
)
|
||||
else:
|
||||
# Batch write to all three tables for consistency
|
||||
batch = BatchStatement()
|
||||
# Insert into subject table
|
||||
batch.add(self.insert_subject_stmt, (collection, s, p, o))
|
||||
|
||||
# Insert into subject table
|
||||
batch.add(self.insert_subject_stmt, (collection, s, p, o))
|
||||
# Insert into predicate-object table (column order: collection, p, o, s)
|
||||
batch.add(self.insert_po_stmt, (collection, p, o, s))
|
||||
|
||||
# Insert into predicate-object table (column order: collection, p, o, s)
|
||||
batch.add(self.insert_po_stmt, (collection, p, o, s))
|
||||
# Insert into object table (column order: collection, o, s, p)
|
||||
batch.add(self.insert_object_stmt, (collection, o, s, p))
|
||||
|
||||
# Insert into object table (column order: collection, o, s, p)
|
||||
batch.add(self.insert_object_stmt, (collection, o, s, p))
|
||||
# Insert into collection table for SPO queries and deletion tracking
|
||||
batch.add(self.insert_collection_stmt, (collection, s, p, o))
|
||||
|
||||
self.session.execute(batch)
|
||||
self.session.execute(batch)
|
||||
|
||||
def get_all(self, collection, limit=50):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select s, p, o from {self.table} where collection = %s limit {limit}",
|
||||
(collection,)
|
||||
)
|
||||
else:
|
||||
# Use subject table for get_all queries
|
||||
return self.session.execute(
|
||||
self.get_all_stmt,
|
||||
(collection, limit)
|
||||
)
|
||||
# Use subject table for get_all queries
|
||||
return self.session.execute(
|
||||
self.get_all_stmt,
|
||||
(collection, limit)
|
||||
)
|
||||
|
||||
def get_s(self, collection, s, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select p, o from {self.table} where collection = %s and s = %s limit {limit}",
|
||||
(collection, s)
|
||||
)
|
||||
else:
|
||||
# Optimized: Direct partition access with (collection, s)
|
||||
return self.session.execute(
|
||||
self.get_s_stmt,
|
||||
(collection, s, limit)
|
||||
)
|
||||
# Optimized: Direct partition access with (collection, s)
|
||||
return self.session.execute(
|
||||
self.get_s_stmt,
|
||||
(collection, s, limit)
|
||||
)
|
||||
|
||||
def get_p(self, collection, p, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select s, o from {self.table} where collection = %s and p = %s limit {limit}",
|
||||
(collection, p)
|
||||
)
|
||||
else:
|
||||
# Optimized: Use po_table for direct partition access
|
||||
return self.session.execute(
|
||||
self.get_p_stmt,
|
||||
(collection, p, limit)
|
||||
)
|
||||
# Optimized: Use po_table for direct partition access
|
||||
return self.session.execute(
|
||||
self.get_p_stmt,
|
||||
(collection, p, limit)
|
||||
)
|
||||
|
||||
def get_o(self, collection, o, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select s, p from {self.table} where collection = %s and o = %s limit {limit}",
|
||||
(collection, o)
|
||||
)
|
||||
else:
|
||||
# Optimized: Use object_table for direct partition access
|
||||
return self.session.execute(
|
||||
self.get_o_stmt,
|
||||
(collection, o, limit)
|
||||
)
|
||||
# Optimized: Use object_table for direct partition access
|
||||
return self.session.execute(
|
||||
self.get_o_stmt,
|
||||
(collection, o, limit)
|
||||
)
|
||||
|
||||
def get_sp(self, collection, s, p, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select o from {self.table} where collection = %s and s = %s and p = %s limit {limit}",
|
||||
(collection, s, p)
|
||||
)
|
||||
else:
|
||||
# Optimized: Use subject_table with clustering key access
|
||||
return self.session.execute(
|
||||
self.get_sp_stmt,
|
||||
(collection, s, p, limit)
|
||||
)
|
||||
# Optimized: Use subject_table with clustering key access
|
||||
return self.session.execute(
|
||||
self.get_sp_stmt,
|
||||
(collection, s, p, limit)
|
||||
)
|
||||
|
||||
def get_po(self, collection, p, o, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select s from {self.table} where collection = %s and p = %s and o = %s limit {limit} allow filtering",
|
||||
(collection, p, o)
|
||||
)
|
||||
else:
|
||||
# CRITICAL OPTIMIZATION: Use po_table - NO MORE ALLOW FILTERING!
|
||||
return self.session.execute(
|
||||
self.get_po_stmt,
|
||||
(collection, p, o, limit)
|
||||
)
|
||||
# CRITICAL OPTIMIZATION: Use po_table - NO MORE ALLOW FILTERING!
|
||||
return self.session.execute(
|
||||
self.get_po_stmt,
|
||||
(collection, p, o, limit)
|
||||
)
|
||||
|
||||
def get_os(self, collection, o, s, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select p from {self.table} where collection = %s and o = %s and s = %s limit {limit} allow filtering",
|
||||
(collection, o, s)
|
||||
)
|
||||
else:
|
||||
# Optimized: Use subject_table with clustering access (no more ALLOW FILTERING)
|
||||
return self.session.execute(
|
||||
self.get_os_stmt,
|
||||
(collection, s, o, limit)
|
||||
)
|
||||
# Optimized: Use subject_table with clustering access (no more ALLOW FILTERING)
|
||||
return self.session.execute(
|
||||
self.get_os_stmt,
|
||||
(collection, s, o, limit)
|
||||
)
|
||||
|
||||
def get_spo(self, collection, s, p, o, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"""select s as x from {self.table} where collection = %s and s = %s and p = %s and o = %s limit {limit}""",
|
||||
(collection, s, p, o)
|
||||
# Optimized: Use collection_table for exact key lookup
|
||||
return self.session.execute(
|
||||
self.get_spo_stmt,
|
||||
(collection, s, p, o, limit)
|
||||
)
|
||||
|
||||
def collection_exists(self, collection):
|
||||
"""Check if collection exists by querying collection_metadata table"""
|
||||
try:
|
||||
result = self.session.execute(
|
||||
f"SELECT collection FROM {self.collection_metadata_table} WHERE collection = %s LIMIT 1",
|
||||
(collection,)
|
||||
)
|
||||
else:
|
||||
# Optimized: Use subject_table for exact key lookup
|
||||
return self.session.execute(
|
||||
self.get_spo_stmt,
|
||||
(collection, s, p, o, limit)
|
||||
return bool(list(result))
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking collection existence: {e}")
|
||||
return False
|
||||
|
||||
def create_collection(self, collection):
|
||||
"""Create collection by inserting metadata row"""
|
||||
try:
|
||||
import datetime
|
||||
self.session.execute(
|
||||
f"INSERT INTO {self.collection_metadata_table} (collection, created_at) VALUES (%s, %s)",
|
||||
(collection, datetime.datetime.now())
|
||||
)
|
||||
logger.info(f"Created collection metadata for {collection}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating collection: {e}")
|
||||
raise e
|
||||
|
||||
def delete_collection(self, collection):
|
||||
"""Delete all triples for a specific collection"""
|
||||
if self.use_legacy:
|
||||
self.session.execute(
|
||||
f"delete from {self.table} where collection = %s",
|
||||
(collection,)
|
||||
)
|
||||
else:
|
||||
# Delete from all three tables
|
||||
self.session.execute(
|
||||
f"delete from {self.subject_table} where collection = %s",
|
||||
(collection,)
|
||||
)
|
||||
self.session.execute(
|
||||
f"delete from {self.po_table} where collection = %s",
|
||||
(collection,)
|
||||
)
|
||||
self.session.execute(
|
||||
f"delete from {self.object_table} where collection = %s",
|
||||
(collection,)
|
||||
)
|
||||
"""Delete all triples for a specific collection
|
||||
|
||||
Uses collection_table to enumerate all triples, then deletes from all 4 tables
|
||||
using full partition keys for optimal performance with compound keys.
|
||||
"""
|
||||
# Step 1: Read all triples from collection_table (single partition read)
|
||||
rows = self.session.execute(
|
||||
f"SELECT s, p, o FROM {self.collection_table} WHERE collection = %s",
|
||||
(collection,)
|
||||
)
|
||||
|
||||
# Step 2: Delete each triple from all 4 tables using full partition keys
|
||||
# Batch deletions for efficiency
|
||||
batch = BatchStatement()
|
||||
count = 0
|
||||
|
||||
for row in rows:
|
||||
s, p, o = row.s, row.p, row.o
|
||||
|
||||
# Delete from subject table (partition key: collection, s)
|
||||
batch.add(SimpleStatement(
|
||||
f"DELETE FROM {self.subject_table} WHERE collection = ? AND s = ? AND p = ? AND o = ?"
|
||||
), (collection, s, p, o))
|
||||
|
||||
# Delete from predicate-object table (partition key: collection, p)
|
||||
batch.add(SimpleStatement(
|
||||
f"DELETE FROM {self.po_table} WHERE collection = ? AND p = ? AND o = ? AND s = ?"
|
||||
), (collection, p, o, s))
|
||||
|
||||
# Delete from object table (partition key: collection, o)
|
||||
batch.add(SimpleStatement(
|
||||
f"DELETE FROM {self.object_table} WHERE collection = ? AND o = ? AND s = ? AND p = ?"
|
||||
), (collection, o, s, p))
|
||||
|
||||
# Delete from collection table (partition key: collection only)
|
||||
batch.add(SimpleStatement(
|
||||
f"DELETE FROM {self.collection_table} WHERE collection = ? AND s = ? AND p = ? AND o = ?"
|
||||
), (collection, s, p, o))
|
||||
|
||||
count += 1
|
||||
|
||||
# Execute batch every 100 triples to avoid oversized batches
|
||||
if count % 100 == 0:
|
||||
self.session.execute(batch)
|
||||
batch = BatchStatement()
|
||||
|
||||
# Execute remaining deletions
|
||||
if count % 100 != 0:
|
||||
self.session.execute(batch)
|
||||
|
||||
# Step 3: Delete collection metadata
|
||||
self.session.execute(
|
||||
f"DELETE FROM {self.collection_metadata_table} WHERE collection = %s",
|
||||
(collection,)
|
||||
)
|
||||
|
||||
logger.info(f"Deleted {count} triples from collection {collection}")
|
||||
|
||||
def close(self):
|
||||
"""Close the Cassandra session and cluster connections properly"""
|
||||
|
|
|
|||
|
|
@ -49,6 +49,22 @@ class DocVectors:
|
|||
self.next_reload = time.time() + self.reload_time
|
||||
logger.debug(f"Reload at {self.next_reload}")
|
||||
|
||||
def collection_exists(self, user, collection):
|
||||
"""Check if collection exists (dimension-independent check)"""
|
||||
collection_name = make_safe_collection_name(user, collection, self.prefix)
|
||||
return self.client.has_collection(collection_name)
|
||||
|
||||
def create_collection(self, user, collection, dimension=384):
|
||||
"""Create collection with default dimension"""
|
||||
collection_name = make_safe_collection_name(user, collection, self.prefix)
|
||||
|
||||
if self.client.has_collection(collection_name):
|
||||
logger.info(f"Collection {collection_name} already exists")
|
||||
return
|
||||
|
||||
self.init_collection(dimension, user, collection)
|
||||
logger.info(f"Created Milvus collection {collection_name} with dimension {dimension}")
|
||||
|
||||
def init_collection(self, dimension, user, collection):
|
||||
|
||||
collection_name = make_safe_collection_name(user, collection, self.prefix)
|
||||
|
|
|
|||
|
|
@ -49,6 +49,22 @@ class EntityVectors:
|
|||
self.next_reload = time.time() + self.reload_time
|
||||
logger.debug(f"Reload at {self.next_reload}")
|
||||
|
||||
def collection_exists(self, user, collection):
|
||||
"""Check if collection exists (dimension-independent check)"""
|
||||
collection_name = make_safe_collection_name(user, collection, self.prefix)
|
||||
return self.client.has_collection(collection_name)
|
||||
|
||||
def create_collection(self, user, collection, dimension=384):
|
||||
"""Create collection with default dimension"""
|
||||
collection_name = make_safe_collection_name(user, collection, self.prefix)
|
||||
|
||||
if self.client.has_collection(collection_name):
|
||||
logger.info(f"Collection {collection_name} already exists")
|
||||
return
|
||||
|
||||
self.init_collection(dimension, user, collection)
|
||||
logger.info(f"Created Milvus collection {collection_name} with dimension {dimension}")
|
||||
|
||||
def init_collection(self, dimension, user, collection):
|
||||
|
||||
collection_name = make_safe_collection_name(user, collection, self.prefix)
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@ class CollectionManager:
|
|||
|
||||
async def ensure_collection_exists(self, user: str, collection: str):
|
||||
"""
|
||||
Ensure a collection exists, creating it if necessary (lazy creation)
|
||||
Ensure a collection exists, creating it if necessary with broadcast to storage
|
||||
|
||||
Args:
|
||||
user: User ID
|
||||
|
|
@ -74,7 +74,7 @@ class CollectionManager:
|
|||
return
|
||||
|
||||
# Create new collection with default metadata
|
||||
logger.info(f"Creating new collection {user}/{collection}")
|
||||
logger.info(f"Auto-creating collection {user}/{collection} from document submission")
|
||||
await self.table_store.create_collection(
|
||||
user=user,
|
||||
collection=collection,
|
||||
|
|
@ -83,10 +83,64 @@ class CollectionManager:
|
|||
tags=set()
|
||||
)
|
||||
|
||||
# Broadcast collection creation to all storage backends
|
||||
creation_key = (user, collection)
|
||||
logger.info(f"Broadcasting create-collection for {creation_key}")
|
||||
|
||||
self.pending_deletions[creation_key] = {
|
||||
"responses_pending": 3, # vector, object, triples
|
||||
"responses_received": [],
|
||||
"all_successful": True,
|
||||
"error_messages": [],
|
||||
"deletion_complete": asyncio.Event()
|
||||
}
|
||||
|
||||
storage_request = StorageManagementRequest(
|
||||
operation="create-collection",
|
||||
user=user,
|
||||
collection=collection
|
||||
)
|
||||
|
||||
# Send creation requests to all storage types
|
||||
if self.vector_storage_producer:
|
||||
await self.vector_storage_producer.send(storage_request)
|
||||
if self.object_storage_producer:
|
||||
await self.object_storage_producer.send(storage_request)
|
||||
if self.triples_storage_producer:
|
||||
await self.triples_storage_producer.send(storage_request)
|
||||
|
||||
# Wait for all storage creations to complete (with timeout)
|
||||
creation_info = self.pending_deletions[creation_key]
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
creation_info["deletion_complete"].wait(),
|
||||
timeout=30.0 # 30 second timeout
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
logger.error(f"Timeout waiting for storage creation responses for {creation_key}")
|
||||
creation_info["all_successful"] = False
|
||||
creation_info["error_messages"].append("Timeout waiting for storage creation")
|
||||
|
||||
# Check if all creations succeeded
|
||||
if not creation_info["all_successful"]:
|
||||
error_msg = f"Storage creation failed: {'; '.join(creation_info['error_messages'])}"
|
||||
logger.error(error_msg)
|
||||
|
||||
# Clean up metadata on failure
|
||||
await self.table_store.delete_collection(user, collection)
|
||||
|
||||
# Clean up tracking
|
||||
del self.pending_deletions[creation_key]
|
||||
|
||||
raise RuntimeError(error_msg)
|
||||
|
||||
# Clean up tracking
|
||||
del self.pending_deletions[creation_key]
|
||||
logger.info(f"Collection {creation_key} auto-created successfully in all storage backends")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error ensuring collection exists: {e}")
|
||||
# Don't fail the operation if collection creation fails
|
||||
# This maintains backward compatibility
|
||||
raise e
|
||||
|
||||
async def list_collections(self, request: CollectionManagementRequest) -> CollectionManagementResponse:
|
||||
"""
|
||||
|
|
@ -154,6 +208,67 @@ class CollectionManager:
|
|||
tags=tags
|
||||
)
|
||||
|
||||
# Broadcast collection creation to all storage backends
|
||||
creation_key = (request.user, request.collection)
|
||||
logger.info(f"Broadcasting create-collection for {creation_key}")
|
||||
|
||||
self.pending_deletions[creation_key] = {
|
||||
"responses_pending": 3, # vector, object, triples
|
||||
"responses_received": [],
|
||||
"all_successful": True,
|
||||
"error_messages": [],
|
||||
"deletion_complete": asyncio.Event()
|
||||
}
|
||||
|
||||
storage_request = StorageManagementRequest(
|
||||
operation="create-collection",
|
||||
user=request.user,
|
||||
collection=request.collection
|
||||
)
|
||||
|
||||
# Send creation requests to all storage types
|
||||
if self.vector_storage_producer:
|
||||
await self.vector_storage_producer.send(storage_request)
|
||||
if self.object_storage_producer:
|
||||
await self.object_storage_producer.send(storage_request)
|
||||
if self.triples_storage_producer:
|
||||
await self.triples_storage_producer.send(storage_request)
|
||||
|
||||
# Wait for all storage creations to complete (with timeout)
|
||||
creation_info = self.pending_deletions[creation_key]
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
creation_info["deletion_complete"].wait(),
|
||||
timeout=30.0 # 30 second timeout
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
logger.error(f"Timeout waiting for storage creation responses for {creation_key}")
|
||||
creation_info["all_successful"] = False
|
||||
creation_info["error_messages"].append("Timeout waiting for storage creation")
|
||||
|
||||
# Check if all creations succeeded
|
||||
if not creation_info["all_successful"]:
|
||||
error_msg = f"Storage creation failed: {'; '.join(creation_info['error_messages'])}"
|
||||
logger.error(error_msg)
|
||||
|
||||
# Clean up metadata on failure
|
||||
await self.table_store.delete_collection(request.user, request.collection)
|
||||
|
||||
# Clean up tracking
|
||||
del self.pending_deletions[creation_key]
|
||||
|
||||
return CollectionManagementResponse(
|
||||
error=Error(
|
||||
type="storage_creation_error",
|
||||
message=error_msg
|
||||
),
|
||||
timestamp=datetime.now().isoformat()
|
||||
)
|
||||
|
||||
# Clean up tracking
|
||||
del self.pending_deletions[creation_key]
|
||||
logger.info(f"Collection {creation_key} created successfully in all storage backends")
|
||||
|
||||
# Get the newly created collection for response
|
||||
created_collection = await self.table_store.get_collection(request.user, request.collection)
|
||||
|
||||
|
|
|
|||
|
|
@ -38,24 +38,10 @@ class Processor(DocumentEmbeddingsQueryService):
|
|||
)
|
||||
|
||||
self.qdrant = QdrantClient(url=store_uri, api_key=api_key)
|
||||
self.last_collection = None
|
||||
|
||||
def ensure_collection_exists(self, collection, dim):
|
||||
"""Ensure collection exists, create if it doesn't"""
|
||||
if collection != self.last_collection:
|
||||
if not self.qdrant.collection_exists(collection):
|
||||
try:
|
||||
self.qdrant.create_collection(
|
||||
collection_name=collection,
|
||||
vectors_config=VectorParams(
|
||||
size=dim, distance=Distance.COSINE
|
||||
),
|
||||
)
|
||||
logger.info(f"Created collection: {collection}")
|
||||
except Exception as e:
|
||||
logger.error(f"Qdrant collection creation failed: {e}")
|
||||
raise e
|
||||
self.last_collection = collection
|
||||
def collection_exists(self, collection):
|
||||
"""Check if collection exists (no implicit creation)"""
|
||||
return self.qdrant.collection_exists(collection)
|
||||
|
||||
async def query_document_embeddings(self, msg):
|
||||
|
||||
|
|
@ -63,15 +49,16 @@ class Processor(DocumentEmbeddingsQueryService):
|
|||
|
||||
chunks = []
|
||||
|
||||
collection = (
|
||||
"d_" + msg.user + "_" + msg.collection
|
||||
)
|
||||
|
||||
# Check if collection exists - return empty if not
|
||||
if not self.collection_exists(collection):
|
||||
logger.info(f"Collection {collection} does not exist, returning empty results")
|
||||
return []
|
||||
|
||||
for vec in msg.vectors:
|
||||
|
||||
dim = len(vec)
|
||||
collection = (
|
||||
"d_" + msg.user + "_" + msg.collection
|
||||
)
|
||||
|
||||
self.ensure_collection_exists(collection, dim)
|
||||
|
||||
search_result = self.qdrant.query_points(
|
||||
collection_name=collection,
|
||||
query=vec,
|
||||
|
|
|
|||
|
|
@ -38,24 +38,10 @@ class Processor(GraphEmbeddingsQueryService):
|
|||
)
|
||||
|
||||
self.qdrant = QdrantClient(url=store_uri, api_key=api_key)
|
||||
self.last_collection = None
|
||||
|
||||
def ensure_collection_exists(self, collection, dim):
|
||||
"""Ensure collection exists, create if it doesn't"""
|
||||
if collection != self.last_collection:
|
||||
if not self.qdrant.collection_exists(collection):
|
||||
try:
|
||||
self.qdrant.create_collection(
|
||||
collection_name=collection,
|
||||
vectors_config=VectorParams(
|
||||
size=dim, distance=Distance.COSINE
|
||||
),
|
||||
)
|
||||
logger.info(f"Created collection: {collection}")
|
||||
except Exception as e:
|
||||
logger.error(f"Qdrant collection creation failed: {e}")
|
||||
raise e
|
||||
self.last_collection = collection
|
||||
def collection_exists(self, collection):
|
||||
"""Check if collection exists (no implicit creation)"""
|
||||
return self.qdrant.collection_exists(collection)
|
||||
|
||||
def create_value(self, ent):
|
||||
if ent.startswith("http://") or ent.startswith("https://"):
|
||||
|
|
@ -70,15 +56,17 @@ class Processor(GraphEmbeddingsQueryService):
|
|||
entity_set = set()
|
||||
entities = []
|
||||
|
||||
collection = (
|
||||
"t_" + msg.user + "_" + msg.collection
|
||||
)
|
||||
|
||||
# Check if collection exists - return empty if not
|
||||
if not self.collection_exists(collection):
|
||||
logger.info(f"Collection {collection} does not exist, returning empty results")
|
||||
return []
|
||||
|
||||
for vec in msg.vectors:
|
||||
|
||||
dim = len(vec)
|
||||
collection = (
|
||||
"t_" + msg.user + "_" + msg.collection
|
||||
)
|
||||
|
||||
self.ensure_collection_exists(collection, dim)
|
||||
|
||||
# Heuristic hack, get (2*limit), so that we have more chance
|
||||
# of getting (limit) entities
|
||||
search_result = self.qdrant.query_points(
|
||||
|
|
|
|||
|
|
@ -60,19 +60,34 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
metrics=storage_response_metrics,
|
||||
)
|
||||
|
||||
async def start(self):
|
||||
"""Start the processor and its storage management consumer"""
|
||||
await super().start()
|
||||
await self.storage_request_consumer.start()
|
||||
await self.storage_response_producer.start()
|
||||
|
||||
async def store_document_embeddings(self, message):
|
||||
|
||||
# Validate collection exists before accepting writes
|
||||
if not self.vecstore.collection_exists(message.metadata.user, message.metadata.collection):
|
||||
error_msg = (
|
||||
f"Collection {message.metadata.collection} does not exist. "
|
||||
f"Create it first with tg-set-collection."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
for emb in message.chunks:
|
||||
|
||||
if emb.chunk is None or emb.chunk == b"": continue
|
||||
|
||||
|
||||
chunk = emb.chunk.decode("utf-8")
|
||||
if chunk == "": continue
|
||||
|
||||
for vec in emb.vectors:
|
||||
self.vecstore.insert(
|
||||
vec, chunk,
|
||||
message.metadata.user,
|
||||
vec, chunk,
|
||||
message.metadata.user,
|
||||
message.metadata.collection
|
||||
)
|
||||
|
||||
|
|
@ -87,18 +102,21 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
help=f'Milvus store URI (default: {default_store_uri})'
|
||||
)
|
||||
|
||||
async def on_storage_management(self, message):
|
||||
async def on_storage_management(self, message, consumer, flow):
|
||||
"""Handle storage management requests"""
|
||||
logger.info(f"Storage management request: {message.operation} for {message.user}/{message.collection}")
|
||||
request = message.value()
|
||||
logger.info(f"Storage management request: {request.operation} for {request.user}/{request.collection}")
|
||||
|
||||
try:
|
||||
if message.operation == "delete-collection":
|
||||
await self.handle_delete_collection(message)
|
||||
if request.operation == "create-collection":
|
||||
await self.handle_create_collection(request)
|
||||
elif request.operation == "delete-collection":
|
||||
await self.handle_delete_collection(request)
|
||||
else:
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="invalid_operation",
|
||||
message=f"Unknown operation: {message.operation}"
|
||||
message=f"Unknown operation: {request.operation}"
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
|
@ -113,17 +131,40 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, message):
|
||||
async def handle_create_collection(self, request):
|
||||
"""Create a Milvus collection for document embeddings"""
|
||||
try:
|
||||
if self.vecstore.collection_exists(request.user, request.collection):
|
||||
logger.info(f"Collection {request.user}/{request.collection} already exists")
|
||||
else:
|
||||
self.vecstore.create_collection(request.user, request.collection)
|
||||
logger.info(f"Created collection {request.user}/{request.collection}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(error=None)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create collection: {e}", exc_info=True)
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="creation_error",
|
||||
message=str(e)
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, request):
|
||||
"""Delete the collection for document embeddings"""
|
||||
try:
|
||||
self.vecstore.delete_collection(message.user, message.collection)
|
||||
self.vecstore.delete_collection(request.user, request.collection)
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(
|
||||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully deleted collection {message.user}/{message.collection}")
|
||||
logger.info(f"Successfully deleted collection {request.user}/{request.collection}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete collection: {e}")
|
||||
|
|
|
|||
|
|
@ -115,38 +115,36 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
"Gave up waiting for index creation"
|
||||
)
|
||||
|
||||
async def start(self):
|
||||
"""Start the processor and its storage management consumer"""
|
||||
await super().start()
|
||||
await self.storage_request_consumer.start()
|
||||
await self.storage_response_producer.start()
|
||||
|
||||
async def store_document_embeddings(self, message):
|
||||
|
||||
index_name = (
|
||||
"d-" + message.metadata.user + "-" + message.metadata.collection
|
||||
)
|
||||
|
||||
# Validate collection exists before accepting writes
|
||||
if not self.pinecone.has_index(index_name):
|
||||
error_msg = (
|
||||
f"Collection {message.metadata.collection} does not exist. "
|
||||
f"Create it first with tg-set-collection."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
for emb in message.chunks:
|
||||
|
||||
if emb.chunk is None or emb.chunk == b"": continue
|
||||
|
||||
|
||||
chunk = emb.chunk.decode("utf-8")
|
||||
if chunk == "": continue
|
||||
|
||||
for vec in emb.vectors:
|
||||
|
||||
dim = len(vec)
|
||||
index_name = (
|
||||
"d-" + message.metadata.user + "-" + message.metadata.collection
|
||||
)
|
||||
|
||||
if index_name != self.last_index_name:
|
||||
|
||||
if not self.pinecone.has_index(index_name):
|
||||
|
||||
try:
|
||||
|
||||
self.create_index(index_name, dim)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Pinecone index creation failed")
|
||||
raise e
|
||||
|
||||
logger.info(f"Index {index_name} created")
|
||||
|
||||
self.last_index_name = index_name
|
||||
|
||||
index = self.pinecone.Index(index_name)
|
||||
|
||||
# Generate unique ID for each vector
|
||||
|
|
@ -192,18 +190,21 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
help=f'Pinecone region, (default: {default_region}'
|
||||
)
|
||||
|
||||
async def on_storage_management(self, message):
|
||||
async def on_storage_management(self, message, consumer, flow):
|
||||
"""Handle storage management requests"""
|
||||
logger.info(f"Storage management request: {message.operation} for {message.user}/{message.collection}")
|
||||
request = message.value()
|
||||
logger.info(f"Storage management request: {request.operation} for {request.user}/{request.collection}")
|
||||
|
||||
try:
|
||||
if message.operation == "delete-collection":
|
||||
await self.handle_delete_collection(message)
|
||||
if request.operation == "create-collection":
|
||||
await self.handle_create_collection(request)
|
||||
elif request.operation == "delete-collection":
|
||||
await self.handle_delete_collection(request)
|
||||
else:
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="invalid_operation",
|
||||
message=f"Unknown operation: {message.operation}"
|
||||
message=f"Unknown operation: {request.operation}"
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
|
@ -218,10 +219,36 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, message):
|
||||
async def handle_create_collection(self, request):
|
||||
"""Create a Pinecone index for document embeddings"""
|
||||
try:
|
||||
index_name = f"d-{request.user}-{request.collection}"
|
||||
|
||||
if self.pinecone.has_index(index_name):
|
||||
logger.info(f"Pinecone index {index_name} already exists")
|
||||
else:
|
||||
# Create with default dimension - will need to be recreated if dimension doesn't match
|
||||
self.create_index(index_name, dim=384)
|
||||
logger.info(f"Created Pinecone index: {index_name}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(error=None)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create collection: {e}", exc_info=True)
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="creation_error",
|
||||
message=str(e)
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, request):
|
||||
"""Delete the collection for document embeddings"""
|
||||
try:
|
||||
index_name = f"d-{message.user}-{message.collection}"
|
||||
index_name = f"d-{request.user}-{request.collection}"
|
||||
|
||||
if self.pinecone.has_index(index_name):
|
||||
self.pinecone.delete_index(index_name)
|
||||
|
|
@ -234,7 +261,7 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully deleted collection {message.user}/{message.collection}")
|
||||
logger.info(f"Successfully deleted collection {request.user}/{request.collection}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete collection: {e}")
|
||||
|
|
|
|||
|
|
@ -36,8 +36,6 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
}
|
||||
)
|
||||
|
||||
self.last_collection = None
|
||||
|
||||
self.qdrant = QdrantClient(url=store_uri, api_key=api_key)
|
||||
|
||||
# Set up storage management if base class attributes are available
|
||||
|
|
@ -71,8 +69,30 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
metrics=storage_response_metrics,
|
||||
)
|
||||
|
||||
async def start(self):
|
||||
"""Start the processor and its storage management consumer"""
|
||||
await super().start()
|
||||
if hasattr(self, 'storage_request_consumer'):
|
||||
await self.storage_request_consumer.start()
|
||||
if hasattr(self, 'storage_response_producer'):
|
||||
await self.storage_response_producer.start()
|
||||
|
||||
async def store_document_embeddings(self, message):
|
||||
|
||||
# Validate collection exists before accepting writes
|
||||
collection = (
|
||||
"d_" + message.metadata.user + "_" +
|
||||
message.metadata.collection
|
||||
)
|
||||
|
||||
if not self.qdrant.collection_exists(collection):
|
||||
error_msg = (
|
||||
f"Collection {message.metadata.collection} does not exist. "
|
||||
f"Create it first with tg-set-collection."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
for emb in message.chunks:
|
||||
|
||||
chunk = emb.chunk.decode("utf-8")
|
||||
|
|
@ -80,29 +100,6 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
|
||||
for vec in emb.vectors:
|
||||
|
||||
dim = len(vec)
|
||||
collection = (
|
||||
"d_" + message.metadata.user + "_" +
|
||||
message.metadata.collection
|
||||
)
|
||||
|
||||
if collection != self.last_collection:
|
||||
|
||||
if not self.qdrant.collection_exists(collection):
|
||||
|
||||
try:
|
||||
self.qdrant.create_collection(
|
||||
collection_name=collection,
|
||||
vectors_config=VectorParams(
|
||||
size=dim, distance=Distance.COSINE
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Qdrant collection creation failed")
|
||||
raise e
|
||||
|
||||
self.last_collection = collection
|
||||
|
||||
self.qdrant.upsert(
|
||||
collection_name=collection,
|
||||
points=[
|
||||
|
|
@ -133,18 +130,21 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
help=f'Qdrant API key (default: None)'
|
||||
)
|
||||
|
||||
async def on_storage_management(self, message):
|
||||
async def on_storage_management(self, message, consumer, flow):
|
||||
"""Handle storage management requests"""
|
||||
logger.info(f"Storage management request: {message.operation} for {message.user}/{message.collection}")
|
||||
request = message.value()
|
||||
logger.info(f"Storage management request: {request.operation} for {request.user}/{request.collection}")
|
||||
|
||||
try:
|
||||
if message.operation == "delete-collection":
|
||||
await self.handle_delete_collection(message)
|
||||
if request.operation == "create-collection":
|
||||
await self.handle_create_collection(request)
|
||||
elif request.operation == "delete-collection":
|
||||
await self.handle_delete_collection(request)
|
||||
else:
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="invalid_operation",
|
||||
message=f"Unknown operation: {message.operation}"
|
||||
message=f"Unknown operation: {request.operation}"
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
|
@ -159,10 +159,43 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, message):
|
||||
async def handle_create_collection(self, request):
|
||||
"""Create a Qdrant collection for document embeddings"""
|
||||
try:
|
||||
collection_name = f"d_{request.user}_{request.collection}"
|
||||
|
||||
if self.qdrant.collection_exists(collection_name):
|
||||
logger.info(f"Qdrant collection {collection_name} already exists")
|
||||
else:
|
||||
# Create collection with default dimension (will be recreated with correct dim on first write if needed)
|
||||
# Using a placeholder dimension - actual dimension determined by first embedding
|
||||
self.qdrant.create_collection(
|
||||
collection_name=collection_name,
|
||||
vectors_config=VectorParams(
|
||||
size=384, # Default dimension, common for many models
|
||||
distance=Distance.COSINE
|
||||
)
|
||||
)
|
||||
logger.info(f"Created Qdrant collection: {collection_name}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(error=None)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create collection: {e}", exc_info=True)
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="creation_error",
|
||||
message=str(e)
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, request):
|
||||
"""Delete the collection for document embeddings"""
|
||||
try:
|
||||
collection_name = f"d_{message.user}_{message.collection}"
|
||||
collection_name = f"d_{request.user}_{request.collection}"
|
||||
|
||||
if self.qdrant.collection_exists(collection_name):
|
||||
self.qdrant.delete_collection(collection_name)
|
||||
|
|
@ -175,7 +208,7 @@ class Processor(DocumentEmbeddingsStoreService):
|
|||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully deleted collection {message.user}/{message.collection}")
|
||||
logger.info(f"Successfully deleted collection {request.user}/{request.collection}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete collection: {e}")
|
||||
|
|
|
|||
|
|
@ -60,8 +60,23 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
metrics=storage_response_metrics,
|
||||
)
|
||||
|
||||
async def start(self):
|
||||
"""Start the processor and its storage management consumer"""
|
||||
await super().start()
|
||||
await self.storage_request_consumer.start()
|
||||
await self.storage_response_producer.start()
|
||||
|
||||
async def store_graph_embeddings(self, message):
|
||||
|
||||
# Validate collection exists before accepting writes
|
||||
if not self.vecstore.collection_exists(message.metadata.user, message.metadata.collection):
|
||||
error_msg = (
|
||||
f"Collection {message.metadata.collection} does not exist. "
|
||||
f"Create it first with tg-set-collection."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
for entity in message.entities:
|
||||
|
||||
if entity.entity.value != "" and entity.entity.value is not None:
|
||||
|
|
@ -83,18 +98,21 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
help=f'Milvus store URI (default: {default_store_uri})'
|
||||
)
|
||||
|
||||
async def on_storage_management(self, message):
|
||||
async def on_storage_management(self, message, consumer, flow):
|
||||
"""Handle storage management requests"""
|
||||
logger.info(f"Storage management request: {message.operation} for {message.user}/{message.collection}")
|
||||
request = message.value()
|
||||
logger.info(f"Storage management request: {request.operation} for {request.user}/{request.collection}")
|
||||
|
||||
try:
|
||||
if message.operation == "delete-collection":
|
||||
await self.handle_delete_collection(message)
|
||||
if request.operation == "create-collection":
|
||||
await self.handle_create_collection(request)
|
||||
elif request.operation == "delete-collection":
|
||||
await self.handle_delete_collection(request)
|
||||
else:
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="invalid_operation",
|
||||
message=f"Unknown operation: {message.operation}"
|
||||
message=f"Unknown operation: {request.operation}"
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
|
@ -109,17 +127,40 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, message):
|
||||
async def handle_create_collection(self, request):
|
||||
"""Create a Milvus collection for graph embeddings"""
|
||||
try:
|
||||
if self.vecstore.collection_exists(request.user, request.collection):
|
||||
logger.info(f"Collection {request.user}/{request.collection} already exists")
|
||||
else:
|
||||
self.vecstore.create_collection(request.user, request.collection)
|
||||
logger.info(f"Created collection {request.user}/{request.collection}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(error=None)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create collection: {e}", exc_info=True)
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="creation_error",
|
||||
message=str(e)
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, request):
|
||||
"""Delete the collection for graph embeddings"""
|
||||
try:
|
||||
self.vecstore.delete_collection(message.user, message.collection)
|
||||
self.vecstore.delete_collection(request.user, request.collection)
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(
|
||||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully deleted collection {message.user}/{message.collection}")
|
||||
logger.info(f"Successfully deleted collection {request.user}/{request.collection}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete collection: {e}")
|
||||
|
|
|
|||
|
|
@ -115,8 +115,27 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
"Gave up waiting for index creation"
|
||||
)
|
||||
|
||||
async def start(self):
|
||||
"""Start the processor and its storage management consumer"""
|
||||
await super().start()
|
||||
await self.storage_request_consumer.start()
|
||||
await self.storage_response_producer.start()
|
||||
|
||||
async def store_graph_embeddings(self, message):
|
||||
|
||||
index_name = (
|
||||
"t-" + message.metadata.user + "-" + message.metadata.collection
|
||||
)
|
||||
|
||||
# Validate collection exists before accepting writes
|
||||
if not self.pinecone.has_index(index_name):
|
||||
error_msg = (
|
||||
f"Collection {message.metadata.collection} does not exist. "
|
||||
f"Create it first with tg-set-collection."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
for entity in message.entities:
|
||||
|
||||
if entity.entity.value == "" or entity.entity.value is None:
|
||||
|
|
@ -124,28 +143,6 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
|
||||
for vec in entity.vectors:
|
||||
|
||||
dim = len(vec)
|
||||
|
||||
index_name = (
|
||||
"t-" + message.metadata.user + "-" + message.metadata.collection
|
||||
)
|
||||
|
||||
if index_name != self.last_index_name:
|
||||
|
||||
if not self.pinecone.has_index(index_name):
|
||||
|
||||
try:
|
||||
|
||||
self.create_index(index_name, dim)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Pinecone index creation failed")
|
||||
raise e
|
||||
|
||||
logger.info(f"Index {index_name} created")
|
||||
|
||||
self.last_index_name = index_name
|
||||
|
||||
index = self.pinecone.Index(index_name)
|
||||
|
||||
# Generate unique ID for each vector
|
||||
|
|
@ -191,18 +188,21 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
help=f'Pinecone region, (default: {default_region}'
|
||||
)
|
||||
|
||||
async def on_storage_management(self, message):
|
||||
async def on_storage_management(self, message, consumer, flow):
|
||||
"""Handle storage management requests"""
|
||||
logger.info(f"Storage management request: {message.operation} for {message.user}/{message.collection}")
|
||||
request = message.value()
|
||||
logger.info(f"Storage management request: {request.operation} for {request.user}/{request.collection}")
|
||||
|
||||
try:
|
||||
if message.operation == "delete-collection":
|
||||
await self.handle_delete_collection(message)
|
||||
if request.operation == "create-collection":
|
||||
await self.handle_create_collection(request)
|
||||
elif request.operation == "delete-collection":
|
||||
await self.handle_delete_collection(request)
|
||||
else:
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="invalid_operation",
|
||||
message=f"Unknown operation: {message.operation}"
|
||||
message=f"Unknown operation: {request.operation}"
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
|
@ -217,10 +217,36 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, message):
|
||||
async def handle_create_collection(self, request):
|
||||
"""Create a Pinecone index for graph embeddings"""
|
||||
try:
|
||||
index_name = f"t-{request.user}-{request.collection}"
|
||||
|
||||
if self.pinecone.has_index(index_name):
|
||||
logger.info(f"Pinecone index {index_name} already exists")
|
||||
else:
|
||||
# Create with default dimension - will need to be recreated if dimension doesn't match
|
||||
self.create_index(index_name, dim=384)
|
||||
logger.info(f"Created Pinecone index: {index_name}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(error=None)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create collection: {e}", exc_info=True)
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="creation_error",
|
||||
message=str(e)
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, request):
|
||||
"""Delete the collection for graph embeddings"""
|
||||
try:
|
||||
index_name = f"t-{message.user}-{message.collection}"
|
||||
index_name = f"t-{request.user}-{request.collection}"
|
||||
|
||||
if self.pinecone.has_index(index_name):
|
||||
self.pinecone.delete_index(index_name)
|
||||
|
|
@ -233,7 +259,7 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully deleted collection {message.user}/{message.collection}")
|
||||
logger.info(f"Successfully deleted collection {request.user}/{request.collection}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete collection: {e}")
|
||||
|
|
|
|||
|
|
@ -36,8 +36,6 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
}
|
||||
)
|
||||
|
||||
self.last_collection = None
|
||||
|
||||
self.qdrant = QdrantClient(url=store_uri, api_key=api_key)
|
||||
|
||||
# Set up storage management if base class attributes are available
|
||||
|
|
@ -71,31 +69,30 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
metrics=storage_response_metrics,
|
||||
)
|
||||
|
||||
def get_collection(self, dim, user, collection):
|
||||
|
||||
def get_collection(self, user, collection):
|
||||
"""Get collection name and validate it exists"""
|
||||
cname = (
|
||||
"t_" + user + "_" + collection
|
||||
)
|
||||
|
||||
if cname != self.last_collection:
|
||||
|
||||
if not self.qdrant.collection_exists(cname):
|
||||
|
||||
try:
|
||||
self.qdrant.create_collection(
|
||||
collection_name=cname,
|
||||
vectors_config=VectorParams(
|
||||
size=dim, distance=Distance.COSINE
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Qdrant collection creation failed")
|
||||
raise e
|
||||
|
||||
self.last_collection = cname
|
||||
if not self.qdrant.collection_exists(cname):
|
||||
error_msg = (
|
||||
f"Collection {collection} does not exist. "
|
||||
f"Create it first with tg-set-collection."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
return cname
|
||||
|
||||
async def start(self):
|
||||
"""Start the processor and its storage management consumer"""
|
||||
await super().start()
|
||||
if hasattr(self, 'storage_request_consumer'):
|
||||
await self.storage_request_consumer.start()
|
||||
if hasattr(self, 'storage_response_producer'):
|
||||
await self.storage_response_producer.start()
|
||||
|
||||
async def store_graph_embeddings(self, message):
|
||||
|
||||
for entity in message.entities:
|
||||
|
|
@ -104,10 +101,8 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
|
||||
for vec in entity.vectors:
|
||||
|
||||
dim = len(vec)
|
||||
|
||||
collection = self.get_collection(
|
||||
dim, message.metadata.user, message.metadata.collection
|
||||
message.metadata.user, message.metadata.collection
|
||||
)
|
||||
|
||||
self.qdrant.upsert(
|
||||
|
|
@ -140,18 +135,21 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
help=f'Qdrant API key'
|
||||
)
|
||||
|
||||
async def on_storage_management(self, message):
|
||||
async def on_storage_management(self, message, consumer, flow):
|
||||
"""Handle storage management requests"""
|
||||
logger.info(f"Storage management request: {message.operation} for {message.user}/{message.collection}")
|
||||
request = message.value()
|
||||
logger.info(f"Storage management request: {request.operation} for {request.user}/{request.collection}")
|
||||
|
||||
try:
|
||||
if message.operation == "delete-collection":
|
||||
await self.handle_delete_collection(message)
|
||||
if request.operation == "create-collection":
|
||||
await self.handle_create_collection(request)
|
||||
elif request.operation == "delete-collection":
|
||||
await self.handle_delete_collection(request)
|
||||
else:
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="invalid_operation",
|
||||
message=f"Unknown operation: {message.operation}"
|
||||
message=f"Unknown operation: {request.operation}"
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
|
@ -166,10 +164,43 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, message):
|
||||
async def handle_create_collection(self, request):
|
||||
"""Create a Qdrant collection for graph embeddings"""
|
||||
try:
|
||||
collection_name = f"t_{request.user}_{request.collection}"
|
||||
|
||||
if self.qdrant.collection_exists(collection_name):
|
||||
logger.info(f"Qdrant collection {collection_name} already exists")
|
||||
else:
|
||||
# Create collection with default dimension (will be recreated with correct dim on first write if needed)
|
||||
# Using a placeholder dimension - actual dimension determined by first embedding
|
||||
self.qdrant.create_collection(
|
||||
collection_name=collection_name,
|
||||
vectors_config=VectorParams(
|
||||
size=384, # Default dimension, common for many models
|
||||
distance=Distance.COSINE
|
||||
)
|
||||
)
|
||||
logger.info(f"Created Qdrant collection: {collection_name}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(error=None)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create collection: {e}", exc_info=True)
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="creation_error",
|
||||
message=str(e)
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, request):
|
||||
"""Delete the collection for graph embeddings"""
|
||||
try:
|
||||
collection_name = f"t_{message.user}_{message.collection}"
|
||||
collection_name = f"t_{request.user}_{request.collection}"
|
||||
|
||||
if self.qdrant.collection_exists(collection_name):
|
||||
self.qdrant.delete_collection(collection_name)
|
||||
|
|
@ -182,7 +213,7 @@ class Processor(GraphEmbeddingsStoreService):
|
|||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully deleted collection {message.user}/{message.collection}")
|
||||
logger.info(f"Successfully deleted collection {request.user}/{request.collection}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete collection: {e}")
|
||||
|
|
|
|||
|
|
@ -295,6 +295,8 @@ class Processor(FlowProcessor):
|
|||
|
||||
try:
|
||||
self.session.execute(create_table_cql)
|
||||
if keyspace not in self.known_tables:
|
||||
self.known_tables[keyspace] = set()
|
||||
self.known_tables[keyspace].add(table_key)
|
||||
logger.info(f"Ensured table exists: {safe_keyspace}.{safe_table}")
|
||||
|
||||
|
|
@ -340,18 +342,47 @@ class Processor(FlowProcessor):
|
|||
logger.warning(f"Failed to convert value {value} to type {field_type}: {e}")
|
||||
return str(value)
|
||||
|
||||
async def start(self):
|
||||
"""Start the processor and its storage management consumer"""
|
||||
await super().start()
|
||||
await self.storage_request_consumer.start()
|
||||
await self.storage_response_producer.start()
|
||||
|
||||
async def on_object(self, msg, consumer, flow):
|
||||
"""Process incoming ExtractedObject and store in Cassandra"""
|
||||
|
||||
|
||||
obj = msg.value()
|
||||
logger.info(f"Storing {len(obj.values)} objects for schema {obj.schema_name} from {obj.metadata.id}")
|
||||
|
||||
|
||||
# Validate collection/keyspace exists before accepting writes
|
||||
safe_keyspace = self.sanitize_name(obj.metadata.user)
|
||||
if safe_keyspace not in self.known_keyspaces:
|
||||
# Check if keyspace actually exists in Cassandra
|
||||
self.connect_cassandra()
|
||||
check_keyspace_cql = """
|
||||
SELECT keyspace_name FROM system_schema.keyspaces
|
||||
WHERE keyspace_name = %s
|
||||
"""
|
||||
result = self.session.execute(check_keyspace_cql, (safe_keyspace,))
|
||||
# Check if result is None (mock case) or has no rows
|
||||
if result is None or not result.one():
|
||||
error_msg = (
|
||||
f"Collection {obj.metadata.collection} does not exist. "
|
||||
f"Create it first with tg-set-collection."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
# Cache it if it exists
|
||||
self.known_keyspaces.add(safe_keyspace)
|
||||
if safe_keyspace not in self.known_tables:
|
||||
self.known_tables[safe_keyspace] = set()
|
||||
|
||||
# Get schema definition
|
||||
schema = self.schemas.get(obj.schema_name)
|
||||
if not schema:
|
||||
logger.warning(f"No schema found for {obj.schema_name} - skipping")
|
||||
return
|
||||
|
||||
|
||||
# Ensure table exists
|
||||
keyspace = obj.metadata.user
|
||||
table_name = obj.schema_name
|
||||
|
|
@ -428,7 +459,16 @@ class Processor(FlowProcessor):
|
|||
logger.info(f"Received storage management request: {msg.operation} for {msg.user}/{msg.collection}")
|
||||
|
||||
try:
|
||||
if msg.operation == "delete-collection":
|
||||
if msg.operation == "create-collection":
|
||||
await self.create_collection(msg.user, msg.collection)
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(
|
||||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully created collection {msg.user}/{msg.collection}")
|
||||
elif msg.operation == "delete-collection":
|
||||
await self.delete_collection(msg.user, msg.collection)
|
||||
|
||||
# Send success response
|
||||
|
|
@ -459,7 +499,25 @@ class Processor(FlowProcessor):
|
|||
message=str(e)
|
||||
)
|
||||
)
|
||||
await self.send("storage-response", response)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def create_collection(self, user: str, collection: str):
|
||||
"""Create/verify collection exists in Cassandra object store"""
|
||||
# Connect if not already connected
|
||||
self.connect_cassandra()
|
||||
|
||||
# Sanitize names for safety
|
||||
safe_keyspace = self.sanitize_name(user)
|
||||
|
||||
# Ensure keyspace exists
|
||||
if safe_keyspace not in self.known_keyspaces:
|
||||
self.ensure_keyspace(safe_keyspace)
|
||||
self.known_keyspaces.add(safe_keyspace)
|
||||
|
||||
# For Cassandra objects, collection is just a property in rows
|
||||
# No need to create separate tables per collection
|
||||
# Just mark that we've seen this collection
|
||||
logger.info(f"Collection {collection} ready for user {user} (using keyspace {safe_keyspace})")
|
||||
|
||||
async def delete_collection(self, user: str, collection: str):
|
||||
"""Delete all data for a specific collection"""
|
||||
|
|
|
|||
|
|
@ -109,6 +109,15 @@ class Processor(TriplesStoreService):
|
|||
|
||||
self.table = user
|
||||
|
||||
# Validate collection exists before accepting writes
|
||||
if not self.tg.collection_exists(message.metadata.collection):
|
||||
error_msg = (
|
||||
f"Collection {message.metadata.collection} does not exist. "
|
||||
f"Create it first with tg-set-collection."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
for t in message.triples:
|
||||
self.tg.insert(
|
||||
message.metadata.collection,
|
||||
|
|
@ -117,18 +126,27 @@ class Processor(TriplesStoreService):
|
|||
t.o.value
|
||||
)
|
||||
|
||||
async def on_storage_management(self, message):
|
||||
async def start(self):
|
||||
"""Start the processor and its storage management consumer"""
|
||||
await super().start()
|
||||
await self.storage_request_consumer.start()
|
||||
await self.storage_response_producer.start()
|
||||
|
||||
async def on_storage_management(self, message, consumer, flow):
|
||||
"""Handle storage management requests"""
|
||||
logger.info(f"Storage management request: {message.operation} for {message.user}/{message.collection}")
|
||||
request = message.value()
|
||||
logger.info(f"Storage management request: {request.operation} for {request.user}/{request.collection}")
|
||||
|
||||
try:
|
||||
if message.operation == "delete-collection":
|
||||
await self.handle_delete_collection(message)
|
||||
if request.operation == "create-collection":
|
||||
await self.handle_create_collection(request)
|
||||
elif request.operation == "delete-collection":
|
||||
await self.handle_delete_collection(request)
|
||||
else:
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="invalid_operation",
|
||||
message=f"Unknown operation: {message.operation}"
|
||||
message=f"Unknown operation: {request.operation}"
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
|
@ -143,42 +161,85 @@ class Processor(TriplesStoreService):
|
|||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, message):
|
||||
"""Delete all data for a specific collection from the unified triples table"""
|
||||
async def handle_create_collection(self, request):
|
||||
"""Create a collection in Cassandra triple store"""
|
||||
try:
|
||||
# Create or reuse connection for this user's keyspace
|
||||
if self.table is None or self.table != message.user:
|
||||
if self.table is None or self.table != request.user:
|
||||
self.tg = None
|
||||
|
||||
try:
|
||||
if self.cassandra_username and self.cassandra_password:
|
||||
self.tg = KnowledgeGraph(
|
||||
hosts=self.cassandra_host,
|
||||
keyspace=message.user,
|
||||
keyspace=request.user,
|
||||
username=self.cassandra_username,
|
||||
password=self.cassandra_password
|
||||
)
|
||||
else:
|
||||
self.tg = KnowledgeGraph(
|
||||
hosts=self.cassandra_host,
|
||||
keyspace=message.user,
|
||||
keyspace=request.user,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to connect to Cassandra for user {message.user}: {e}")
|
||||
logger.error(f"Failed to connect to Cassandra for user {request.user}: {e}")
|
||||
raise
|
||||
|
||||
self.table = message.user
|
||||
self.table = request.user
|
||||
|
||||
# Delete all triples for this collection from the unified table
|
||||
# In the unified table schema, collection is the partition key
|
||||
delete_cql = """
|
||||
DELETE FROM triples
|
||||
WHERE collection = ?
|
||||
"""
|
||||
# Create collection using the built-in method
|
||||
logger.info(f"Creating collection {request.collection} for user {request.user}")
|
||||
|
||||
if self.tg.collection_exists(request.collection):
|
||||
logger.info(f"Collection {request.collection} already exists")
|
||||
else:
|
||||
self.tg.create_collection(request.collection)
|
||||
logger.info(f"Created collection {request.collection}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(error=None)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create collection: {e}", exc_info=True)
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="creation_error",
|
||||
message=str(e)
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, request):
|
||||
"""Delete all data for a specific collection from the unified triples table"""
|
||||
try:
|
||||
# Create or reuse connection for this user's keyspace
|
||||
if self.table is None or self.table != request.user:
|
||||
self.tg = None
|
||||
|
||||
try:
|
||||
if self.cassandra_username and self.cassandra_password:
|
||||
self.tg = KnowledgeGraph(
|
||||
hosts=self.cassandra_host,
|
||||
keyspace=request.user,
|
||||
username=self.cassandra_username,
|
||||
password=self.cassandra_password
|
||||
)
|
||||
else:
|
||||
self.tg = KnowledgeGraph(
|
||||
hosts=self.cassandra_host,
|
||||
keyspace=request.user,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to connect to Cassandra for user {request.user}: {e}")
|
||||
raise
|
||||
|
||||
self.table = request.user
|
||||
|
||||
# Delete all triples for this collection using the built-in method
|
||||
try:
|
||||
self.tg.session.execute(delete_cql, (message.collection,))
|
||||
logger.info(f"Deleted all triples for collection {message.collection} from keyspace {message.user}")
|
||||
self.tg.delete_collection(request.collection)
|
||||
logger.info(f"Deleted all triples for collection {request.collection} from keyspace {request.user}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete collection data: {e}")
|
||||
raise
|
||||
|
|
@ -188,7 +249,7 @@ class Processor(TriplesStoreService):
|
|||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully deleted collection {message.user}/{message.collection}")
|
||||
logger.info(f"Successfully deleted collection {request.user}/{request.collection}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete collection: {e}")
|
||||
|
|
|
|||
|
|
@ -152,11 +152,43 @@ class Processor(TriplesStoreService):
|
|||
time=res.run_time_ms
|
||||
))
|
||||
|
||||
def collection_exists(self, user, collection):
|
||||
"""Check if collection metadata node exists"""
|
||||
result = self.io.query(
|
||||
"MATCH (c:CollectionMetadata {user: $user, collection: $collection}) "
|
||||
"RETURN c LIMIT 1",
|
||||
params={"user": user, "collection": collection}
|
||||
)
|
||||
return result.result_set is not None and len(result.result_set) > 0
|
||||
|
||||
def create_collection(self, user, collection):
|
||||
"""Create collection metadata node"""
|
||||
import datetime
|
||||
self.io.query(
|
||||
"MERGE (c:CollectionMetadata {user: $user, collection: $collection}) "
|
||||
"SET c.created_at = $created_at",
|
||||
params={
|
||||
"user": user,
|
||||
"collection": collection,
|
||||
"created_at": datetime.datetime.now().isoformat()
|
||||
}
|
||||
)
|
||||
logger.info(f"Created collection metadata node for {user}/{collection}")
|
||||
|
||||
async def store_triples(self, message):
|
||||
# Extract user and collection from metadata
|
||||
user = message.metadata.user if message.metadata.user else "default"
|
||||
collection = message.metadata.collection if message.metadata.collection else "default"
|
||||
|
||||
# Validate collection exists before accepting writes
|
||||
if not self.collection_exists(user, collection):
|
||||
error_msg = (
|
||||
f"Collection {collection} does not exist. "
|
||||
f"Create it first with tg-set-collection."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
for t in message.triples:
|
||||
|
||||
self.create_node(t.s.value, user, collection)
|
||||
|
|
@ -185,18 +217,27 @@ class Processor(TriplesStoreService):
|
|||
help=f'FalkorDB database (default: {default_database})'
|
||||
)
|
||||
|
||||
async def on_storage_management(self, message):
|
||||
async def start(self):
|
||||
"""Start the processor and its storage management consumer"""
|
||||
await super().start()
|
||||
await self.storage_request_consumer.start()
|
||||
await self.storage_response_producer.start()
|
||||
|
||||
async def on_storage_management(self, message, consumer, flow):
|
||||
"""Handle storage management requests"""
|
||||
logger.info(f"Storage management request: {message.operation} for {message.user}/{message.collection}")
|
||||
request = message.value()
|
||||
logger.info(f"Storage management request: {request.operation} for {request.user}/{request.collection}")
|
||||
|
||||
try:
|
||||
if message.operation == "delete-collection":
|
||||
await self.handle_delete_collection(message)
|
||||
if request.operation == "create-collection":
|
||||
await self.handle_create_collection(request)
|
||||
elif request.operation == "delete-collection":
|
||||
await self.handle_delete_collection(request)
|
||||
else:
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="invalid_operation",
|
||||
message=f"Unknown operation: {message.operation}"
|
||||
message=f"Unknown operation: {request.operation}"
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
|
@ -211,28 +252,57 @@ class Processor(TriplesStoreService):
|
|||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, message):
|
||||
async def handle_create_collection(self, request):
|
||||
"""Create collection metadata in FalkorDB"""
|
||||
try:
|
||||
if self.collection_exists(request.user, request.collection):
|
||||
logger.info(f"Collection {request.user}/{request.collection} already exists")
|
||||
else:
|
||||
self.create_collection(request.user, request.collection)
|
||||
logger.info(f"Created collection {request.user}/{request.collection}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(error=None)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create collection: {e}", exc_info=True)
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="creation_error",
|
||||
message=str(e)
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, request):
|
||||
"""Delete the collection for FalkorDB triples"""
|
||||
try:
|
||||
# Delete all nodes and literals for this user/collection
|
||||
node_result = self.io.query(
|
||||
"MATCH (n:Node {user: $user, collection: $collection}) DETACH DELETE n",
|
||||
params={"user": message.user, "collection": message.collection}
|
||||
params={"user": request.user, "collection": request.collection}
|
||||
)
|
||||
|
||||
literal_result = self.io.query(
|
||||
"MATCH (n:Literal {user: $user, collection: $collection}) DETACH DELETE n",
|
||||
params={"user": message.user, "collection": message.collection}
|
||||
params={"user": request.user, "collection": request.collection}
|
||||
)
|
||||
|
||||
logger.info(f"Deleted {node_result.nodes_deleted} nodes and {literal_result.nodes_deleted} literals for collection {message.user}/{message.collection}")
|
||||
# Delete collection metadata node
|
||||
metadata_result = self.io.query(
|
||||
"MATCH (c:CollectionMetadata {user: $user, collection: $collection}) DELETE c",
|
||||
params={"user": request.user, "collection": request.collection}
|
||||
)
|
||||
|
||||
logger.info(f"Deleted {node_result.nodes_deleted} nodes, {literal_result.nodes_deleted} literals, and {metadata_result.nodes_deleted} metadata nodes for collection {request.user}/{request.collection}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(
|
||||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully deleted collection {message.user}/{message.collection}")
|
||||
logger.info(f"Successfully deleted collection {request.user}/{request.collection}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete collection: {e}")
|
||||
|
|
|
|||
|
|
@ -267,12 +267,43 @@ class Processor(TriplesStoreService):
|
|||
src=t.s.value, dest=t.o.value, uri=t.p.value, user=user, collection=collection,
|
||||
)
|
||||
|
||||
def collection_exists(self, user, collection):
|
||||
"""Check if collection metadata node exists"""
|
||||
with self.io.session(database=self.db) as session:
|
||||
result = session.run(
|
||||
"MATCH (c:CollectionMetadata {user: $user, collection: $collection}) "
|
||||
"RETURN c LIMIT 1",
|
||||
user=user, collection=collection
|
||||
)
|
||||
return bool(list(result))
|
||||
|
||||
def create_collection(self, user, collection):
|
||||
"""Create collection metadata node"""
|
||||
import datetime
|
||||
with self.io.session(database=self.db) as session:
|
||||
session.run(
|
||||
"MERGE (c:CollectionMetadata {user: $user, collection: $collection}) "
|
||||
"SET c.created_at = $created_at",
|
||||
user=user, collection=collection,
|
||||
created_at=datetime.datetime.now().isoformat()
|
||||
)
|
||||
logger.info(f"Created collection metadata node for {user}/{collection}")
|
||||
|
||||
async def store_triples(self, message):
|
||||
|
||||
# Extract user and collection from metadata
|
||||
user = message.metadata.user if message.metadata.user else "default"
|
||||
collection = message.metadata.collection if message.metadata.collection else "default"
|
||||
|
||||
# Validate collection exists before accepting writes
|
||||
if not self.collection_exists(user, collection):
|
||||
error_msg = (
|
||||
f"Collection {collection} does not exist. "
|
||||
f"Create it first with tg-set-collection."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
for t in message.triples:
|
||||
|
||||
self.create_node(t.s.value, user, collection)
|
||||
|
|
@ -317,18 +348,27 @@ class Processor(TriplesStoreService):
|
|||
help=f'Memgraph database (default: {default_database})'
|
||||
)
|
||||
|
||||
async def on_storage_management(self, message):
|
||||
async def start(self):
|
||||
"""Start the processor and its storage management consumer"""
|
||||
await super().start()
|
||||
await self.storage_request_consumer.start()
|
||||
await self.storage_response_producer.start()
|
||||
|
||||
async def on_storage_management(self, message, consumer, flow):
|
||||
"""Handle storage management requests"""
|
||||
logger.info(f"Storage management request: {message.operation} for {message.user}/{message.collection}")
|
||||
request = message.value()
|
||||
logger.info(f"Storage management request: {request.operation} for {request.user}/{request.collection}")
|
||||
|
||||
try:
|
||||
if message.operation == "delete-collection":
|
||||
await self.handle_delete_collection(message)
|
||||
if request.operation == "create-collection":
|
||||
await self.handle_create_collection(request)
|
||||
elif request.operation == "delete-collection":
|
||||
await self.handle_delete_collection(request)
|
||||
else:
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="invalid_operation",
|
||||
message=f"Unknown operation: {message.operation}"
|
||||
message=f"Unknown operation: {request.operation}"
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
|
@ -343,7 +383,30 @@ class Processor(TriplesStoreService):
|
|||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, message):
|
||||
async def handle_create_collection(self, request):
|
||||
"""Create collection metadata in Memgraph"""
|
||||
try:
|
||||
if self.collection_exists(request.user, request.collection):
|
||||
logger.info(f"Collection {request.user}/{request.collection} already exists")
|
||||
else:
|
||||
self.create_collection(request.user, request.collection)
|
||||
logger.info(f"Created collection {request.user}/{request.collection}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(error=None)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create collection: {e}", exc_info=True)
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="creation_error",
|
||||
message=str(e)
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, request):
|
||||
"""Delete all data for a specific collection"""
|
||||
try:
|
||||
with self.io.session(database=self.db) as session:
|
||||
|
|
@ -351,7 +414,7 @@ class Processor(TriplesStoreService):
|
|||
node_result = session.run(
|
||||
"MATCH (n:Node {user: $user, collection: $collection}) "
|
||||
"DETACH DELETE n",
|
||||
user=message.user, collection=message.collection
|
||||
user=request.user, collection=request.collection
|
||||
)
|
||||
nodes_deleted = node_result.consume().counters.nodes_deleted
|
||||
|
||||
|
|
@ -359,20 +422,28 @@ class Processor(TriplesStoreService):
|
|||
literal_result = session.run(
|
||||
"MATCH (n:Literal {user: $user, collection: $collection}) "
|
||||
"DETACH DELETE n",
|
||||
user=message.user, collection=message.collection
|
||||
user=request.user, collection=request.collection
|
||||
)
|
||||
literals_deleted = literal_result.consume().counters.nodes_deleted
|
||||
|
||||
# Delete collection metadata node
|
||||
metadata_result = session.run(
|
||||
"MATCH (c:CollectionMetadata {user: $user, collection: $collection}) "
|
||||
"DELETE c",
|
||||
user=request.user, collection=request.collection
|
||||
)
|
||||
metadata_deleted = metadata_result.consume().counters.nodes_deleted
|
||||
|
||||
# Note: Relationships are automatically deleted with DETACH DELETE
|
||||
|
||||
logger.info(f"Deleted {nodes_deleted} nodes and {literals_deleted} literals for {message.user}/{message.collection}")
|
||||
logger.info(f"Deleted {nodes_deleted} nodes, {literals_deleted} literals, and {metadata_deleted} metadata nodes for {request.user}/{request.collection}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(
|
||||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully deleted collection {message.user}/{message.collection}")
|
||||
logger.info(f"Successfully deleted collection {request.user}/{request.collection}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete collection: {e}")
|
||||
|
|
|
|||
|
|
@ -228,6 +228,15 @@ class Processor(TriplesStoreService):
|
|||
user = message.metadata.user if message.metadata.user else "default"
|
||||
collection = message.metadata.collection if message.metadata.collection else "default"
|
||||
|
||||
# Validate collection exists before accepting writes
|
||||
if not self.collection_exists(user, collection):
|
||||
error_msg = (
|
||||
f"Collection {collection} does not exist. "
|
||||
f"Create it first with tg-set-collection."
|
||||
)
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
for t in message.triples:
|
||||
|
||||
self.create_node(t.s.value, user, collection)
|
||||
|
|
@ -268,18 +277,27 @@ class Processor(TriplesStoreService):
|
|||
help=f'Neo4j database (default: {default_database})'
|
||||
)
|
||||
|
||||
async def on_storage_management(self, message):
|
||||
async def start(self):
|
||||
"""Start the processor and its storage management consumer"""
|
||||
await super().start()
|
||||
await self.storage_request_consumer.start()
|
||||
await self.storage_response_producer.start()
|
||||
|
||||
async def on_storage_management(self, message, consumer, flow):
|
||||
"""Handle storage management requests"""
|
||||
logger.info(f"Storage management request: {message.operation} for {message.user}/{message.collection}")
|
||||
request = message.value()
|
||||
logger.info(f"Storage management request: {request.operation} for {request.user}/{request.collection}")
|
||||
|
||||
try:
|
||||
if message.operation == "delete-collection":
|
||||
await self.handle_delete_collection(message)
|
||||
if request.operation == "create-collection":
|
||||
await self.handle_create_collection(request)
|
||||
elif request.operation == "delete-collection":
|
||||
await self.handle_delete_collection(request)
|
||||
else:
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="invalid_operation",
|
||||
message=f"Unknown operation: {message.operation}"
|
||||
message=f"Unknown operation: {request.operation}"
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
|
@ -294,7 +312,52 @@ class Processor(TriplesStoreService):
|
|||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, message):
|
||||
def collection_exists(self, user, collection):
|
||||
"""Check if collection metadata node exists"""
|
||||
with self.io.session(database=self.db) as session:
|
||||
result = session.run(
|
||||
"MATCH (c:CollectionMetadata {user: $user, collection: $collection}) "
|
||||
"RETURN c LIMIT 1",
|
||||
user=user, collection=collection
|
||||
)
|
||||
return bool(list(result))
|
||||
|
||||
def create_collection(self, user, collection):
|
||||
"""Create collection metadata node"""
|
||||
import datetime
|
||||
with self.io.session(database=self.db) as session:
|
||||
session.run(
|
||||
"MERGE (c:CollectionMetadata {user: $user, collection: $collection}) "
|
||||
"SET c.created_at = $created_at",
|
||||
user=user, collection=collection,
|
||||
created_at=datetime.datetime.now().isoformat()
|
||||
)
|
||||
logger.info(f"Created collection metadata node for {user}/{collection}")
|
||||
|
||||
async def handle_create_collection(self, request):
|
||||
"""Create collection metadata in Neo4j"""
|
||||
try:
|
||||
if self.collection_exists(request.user, request.collection):
|
||||
logger.info(f"Collection {request.user}/{request.collection} already exists")
|
||||
else:
|
||||
self.create_collection(request.user, request.collection)
|
||||
logger.info(f"Created collection {request.user}/{request.collection}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(error=None)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create collection: {e}", exc_info=True)
|
||||
response = StorageManagementResponse(
|
||||
error=Error(
|
||||
type="creation_error",
|
||||
message=str(e)
|
||||
)
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
|
||||
async def handle_delete_collection(self, request):
|
||||
"""Delete all data for a specific collection"""
|
||||
try:
|
||||
with self.io.session(database=self.db) as session:
|
||||
|
|
@ -302,7 +365,7 @@ class Processor(TriplesStoreService):
|
|||
node_result = session.run(
|
||||
"MATCH (n:Node {user: $user, collection: $collection}) "
|
||||
"DETACH DELETE n",
|
||||
user=message.user, collection=message.collection
|
||||
user=request.user, collection=request.collection
|
||||
)
|
||||
nodes_deleted = node_result.consume().counters.nodes_deleted
|
||||
|
||||
|
|
@ -310,20 +373,28 @@ class Processor(TriplesStoreService):
|
|||
literal_result = session.run(
|
||||
"MATCH (n:Literal {user: $user, collection: $collection}) "
|
||||
"DETACH DELETE n",
|
||||
user=message.user, collection=message.collection
|
||||
user=request.user, collection=request.collection
|
||||
)
|
||||
literals_deleted = literal_result.consume().counters.nodes_deleted
|
||||
|
||||
# Note: Relationships are automatically deleted with DETACH DELETE
|
||||
|
||||
logger.info(f"Deleted {nodes_deleted} nodes and {literals_deleted} literals for {message.user}/{message.collection}")
|
||||
# Delete collection metadata node
|
||||
metadata_result = session.run(
|
||||
"MATCH (c:CollectionMetadata {user: $user, collection: $collection}) "
|
||||
"DELETE c",
|
||||
user=request.user, collection=request.collection
|
||||
)
|
||||
metadata_deleted = metadata_result.consume().counters.nodes_deleted
|
||||
|
||||
logger.info(f"Deleted {nodes_deleted} nodes, {literals_deleted} literals, and {metadata_deleted} metadata nodes for {request.user}/{request.collection}")
|
||||
|
||||
# Send success response
|
||||
response = StorageManagementResponse(
|
||||
error=None # No error means success
|
||||
)
|
||||
await self.storage_response_producer.send(response)
|
||||
logger.info(f"Successfully deleted collection {message.user}/{message.collection}")
|
||||
logger.info(f"Successfully deleted collection {request.user}/{request.collection}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete collection: {e}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue