mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-01 19:32:38 +02:00
Collection delete pt. 3 (#542)
* Fixing collection deletion * Fixing collection management param error * Always test for collections * Add Cassandra collection table * Updated tech spec for explicit creation/deletion * Remove implicit collection creation * Fix up collection tracking in all processors
This commit is contained in:
parent
dc79b10552
commit
52b133fc86
31 changed files with 1761 additions and 843 deletions
|
|
@ -24,16 +24,12 @@ class KnowledgeGraph:
|
|||
self.keyspace = keyspace
|
||||
self.username = username
|
||||
|
||||
# Multi-table schema design for optimal performance
|
||||
self.use_legacy = os.getenv('CASSANDRA_USE_LEGACY', 'false').lower() == 'true'
|
||||
|
||||
if self.use_legacy:
|
||||
self.table = "triples" # Legacy single table
|
||||
else:
|
||||
# New optimized tables
|
||||
self.subject_table = "triples_s"
|
||||
self.po_table = "triples_p"
|
||||
self.object_table = "triples_o"
|
||||
# Optimized multi-table schema with collection deletion support
|
||||
self.subject_table = "triples_s"
|
||||
self.po_table = "triples_p"
|
||||
self.object_table = "triples_o"
|
||||
self.collection_table = "triples_collection" # For SPO queries and deletion
|
||||
self.collection_metadata_table = "collection_metadata" # For tracking which collections exist
|
||||
|
||||
if username and password:
|
||||
ssl_context = SSLContext(PROTOCOL_TLSv1_2)
|
||||
|
|
@ -47,9 +43,7 @@ class KnowledgeGraph:
|
|||
_active_clusters.append(self.cluster)
|
||||
|
||||
self.init()
|
||||
|
||||
if not self.use_legacy:
|
||||
self.prepare_statements()
|
||||
self.prepare_statements()
|
||||
|
||||
def clear(self):
|
||||
|
||||
|
|
@ -70,42 +64,13 @@ class KnowledgeGraph:
|
|||
""");
|
||||
|
||||
self.session.set_keyspace(self.keyspace)
|
||||
self.init_optimized_schema()
|
||||
|
||||
if self.use_legacy:
|
||||
self.init_legacy_schema()
|
||||
else:
|
||||
self.init_optimized_schema()
|
||||
|
||||
def init_legacy_schema(self):
|
||||
"""Initialize legacy single-table schema for backward compatibility"""
|
||||
self.session.execute(f"""
|
||||
create table if not exists {self.table} (
|
||||
collection text,
|
||||
s text,
|
||||
p text,
|
||||
o text,
|
||||
PRIMARY KEY (collection, s, p, o)
|
||||
);
|
||||
""");
|
||||
|
||||
self.session.execute(f"""
|
||||
create index if not exists {self.table}_s
|
||||
ON {self.table} (s);
|
||||
""");
|
||||
|
||||
self.session.execute(f"""
|
||||
create index if not exists {self.table}_p
|
||||
ON {self.table} (p);
|
||||
""");
|
||||
|
||||
self.session.execute(f"""
|
||||
create index if not exists {self.table}_o
|
||||
ON {self.table} (o);
|
||||
""");
|
||||
|
||||
def init_optimized_schema(self):
|
||||
"""Initialize optimized multi-table schema for performance"""
|
||||
# Table 1: Subject-centric queries (get_s, get_sp, get_spo, get_os)
|
||||
# Table 1: Subject-centric queries (get_s, get_sp, get_os)
|
||||
# Compound partition key for optimal data distribution
|
||||
self.session.execute(f"""
|
||||
CREATE TABLE IF NOT EXISTS {self.subject_table} (
|
||||
collection text,
|
||||
|
|
@ -117,6 +82,7 @@ class KnowledgeGraph:
|
|||
""");
|
||||
|
||||
# Table 2: Predicate-Object queries (get_p, get_po) - eliminates ALLOW FILTERING!
|
||||
# Compound partition key for optimal data distribution
|
||||
self.session.execute(f"""
|
||||
CREATE TABLE IF NOT EXISTS {self.po_table} (
|
||||
collection text,
|
||||
|
|
@ -128,6 +94,7 @@ class KnowledgeGraph:
|
|||
""");
|
||||
|
||||
# Table 3: Object-centric queries (get_o)
|
||||
# Compound partition key for optimal data distribution
|
||||
self.session.execute(f"""
|
||||
CREATE TABLE IF NOT EXISTS {self.object_table} (
|
||||
collection text,
|
||||
|
|
@ -138,7 +105,29 @@ class KnowledgeGraph:
|
|||
);
|
||||
""");
|
||||
|
||||
logger.info("Optimized multi-table schema initialized")
|
||||
# Table 4: Collection management and SPO queries (get_spo)
|
||||
# Simple partition key enables efficient collection deletion
|
||||
self.session.execute(f"""
|
||||
CREATE TABLE IF NOT EXISTS {self.collection_table} (
|
||||
collection text,
|
||||
s text,
|
||||
p text,
|
||||
o text,
|
||||
PRIMARY KEY (collection, s, p, o)
|
||||
);
|
||||
""");
|
||||
|
||||
# Table 5: Collection metadata tracking
|
||||
# Tracks which collections exist without polluting triple data
|
||||
self.session.execute(f"""
|
||||
CREATE TABLE IF NOT EXISTS {self.collection_metadata_table} (
|
||||
collection text,
|
||||
created_at timestamp,
|
||||
PRIMARY KEY (collection)
|
||||
);
|
||||
""");
|
||||
|
||||
logger.info("Optimized multi-table schema initialized (5 tables)")
|
||||
|
||||
def prepare_statements(self):
|
||||
"""Prepare statements for optimal performance"""
|
||||
|
|
@ -155,6 +144,10 @@ class KnowledgeGraph:
|
|||
f"INSERT INTO {self.object_table} (collection, o, s, p) VALUES (?, ?, ?, ?)"
|
||||
)
|
||||
|
||||
self.insert_collection_stmt = self.session.prepare(
|
||||
f"INSERT INTO {self.collection_table} (collection, s, p, o) VALUES (?, ?, ?, ?)"
|
||||
)
|
||||
|
||||
# Query statements for optimized access
|
||||
self.get_all_stmt = self.session.prepare(
|
||||
f"SELECT s, p, o FROM {self.subject_table} WHERE collection = ? LIMIT ? ALLOW FILTERING"
|
||||
|
|
@ -186,158 +179,168 @@ class KnowledgeGraph:
|
|||
)
|
||||
|
||||
self.get_spo_stmt = self.session.prepare(
|
||||
f"SELECT s as x FROM {self.subject_table} WHERE collection = ? AND s = ? AND p = ? AND o = ? LIMIT ?"
|
||||
f"SELECT s as x FROM {self.collection_table} WHERE collection = ? AND s = ? AND p = ? AND o = ? LIMIT ?"
|
||||
)
|
||||
|
||||
logger.info("Prepared statements initialized for optimal performance")
|
||||
logger.info("Prepared statements initialized for optimal performance (4 tables)")
|
||||
|
||||
def insert(self, collection, s, p, o):
|
||||
# Batch write to all four tables for consistency
|
||||
batch = BatchStatement()
|
||||
|
||||
if self.use_legacy:
|
||||
self.session.execute(
|
||||
f"insert into {self.table} (collection, s, p, o) values (%s, %s, %s, %s)",
|
||||
(collection, s, p, o)
|
||||
)
|
||||
else:
|
||||
# Batch write to all three tables for consistency
|
||||
batch = BatchStatement()
|
||||
# Insert into subject table
|
||||
batch.add(self.insert_subject_stmt, (collection, s, p, o))
|
||||
|
||||
# Insert into subject table
|
||||
batch.add(self.insert_subject_stmt, (collection, s, p, o))
|
||||
# Insert into predicate-object table (column order: collection, p, o, s)
|
||||
batch.add(self.insert_po_stmt, (collection, p, o, s))
|
||||
|
||||
# Insert into predicate-object table (column order: collection, p, o, s)
|
||||
batch.add(self.insert_po_stmt, (collection, p, o, s))
|
||||
# Insert into object table (column order: collection, o, s, p)
|
||||
batch.add(self.insert_object_stmt, (collection, o, s, p))
|
||||
|
||||
# Insert into object table (column order: collection, o, s, p)
|
||||
batch.add(self.insert_object_stmt, (collection, o, s, p))
|
||||
# Insert into collection table for SPO queries and deletion tracking
|
||||
batch.add(self.insert_collection_stmt, (collection, s, p, o))
|
||||
|
||||
self.session.execute(batch)
|
||||
self.session.execute(batch)
|
||||
|
||||
def get_all(self, collection, limit=50):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select s, p, o from {self.table} where collection = %s limit {limit}",
|
||||
(collection,)
|
||||
)
|
||||
else:
|
||||
# Use subject table for get_all queries
|
||||
return self.session.execute(
|
||||
self.get_all_stmt,
|
||||
(collection, limit)
|
||||
)
|
||||
# Use subject table for get_all queries
|
||||
return self.session.execute(
|
||||
self.get_all_stmt,
|
||||
(collection, limit)
|
||||
)
|
||||
|
||||
def get_s(self, collection, s, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select p, o from {self.table} where collection = %s and s = %s limit {limit}",
|
||||
(collection, s)
|
||||
)
|
||||
else:
|
||||
# Optimized: Direct partition access with (collection, s)
|
||||
return self.session.execute(
|
||||
self.get_s_stmt,
|
||||
(collection, s, limit)
|
||||
)
|
||||
# Optimized: Direct partition access with (collection, s)
|
||||
return self.session.execute(
|
||||
self.get_s_stmt,
|
||||
(collection, s, limit)
|
||||
)
|
||||
|
||||
def get_p(self, collection, p, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select s, o from {self.table} where collection = %s and p = %s limit {limit}",
|
||||
(collection, p)
|
||||
)
|
||||
else:
|
||||
# Optimized: Use po_table for direct partition access
|
||||
return self.session.execute(
|
||||
self.get_p_stmt,
|
||||
(collection, p, limit)
|
||||
)
|
||||
# Optimized: Use po_table for direct partition access
|
||||
return self.session.execute(
|
||||
self.get_p_stmt,
|
||||
(collection, p, limit)
|
||||
)
|
||||
|
||||
def get_o(self, collection, o, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select s, p from {self.table} where collection = %s and o = %s limit {limit}",
|
||||
(collection, o)
|
||||
)
|
||||
else:
|
||||
# Optimized: Use object_table for direct partition access
|
||||
return self.session.execute(
|
||||
self.get_o_stmt,
|
||||
(collection, o, limit)
|
||||
)
|
||||
# Optimized: Use object_table for direct partition access
|
||||
return self.session.execute(
|
||||
self.get_o_stmt,
|
||||
(collection, o, limit)
|
||||
)
|
||||
|
||||
def get_sp(self, collection, s, p, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select o from {self.table} where collection = %s and s = %s and p = %s limit {limit}",
|
||||
(collection, s, p)
|
||||
)
|
||||
else:
|
||||
# Optimized: Use subject_table with clustering key access
|
||||
return self.session.execute(
|
||||
self.get_sp_stmt,
|
||||
(collection, s, p, limit)
|
||||
)
|
||||
# Optimized: Use subject_table with clustering key access
|
||||
return self.session.execute(
|
||||
self.get_sp_stmt,
|
||||
(collection, s, p, limit)
|
||||
)
|
||||
|
||||
def get_po(self, collection, p, o, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select s from {self.table} where collection = %s and p = %s and o = %s limit {limit} allow filtering",
|
||||
(collection, p, o)
|
||||
)
|
||||
else:
|
||||
# CRITICAL OPTIMIZATION: Use po_table - NO MORE ALLOW FILTERING!
|
||||
return self.session.execute(
|
||||
self.get_po_stmt,
|
||||
(collection, p, o, limit)
|
||||
)
|
||||
# CRITICAL OPTIMIZATION: Use po_table - NO MORE ALLOW FILTERING!
|
||||
return self.session.execute(
|
||||
self.get_po_stmt,
|
||||
(collection, p, o, limit)
|
||||
)
|
||||
|
||||
def get_os(self, collection, o, s, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"select p from {self.table} where collection = %s and o = %s and s = %s limit {limit} allow filtering",
|
||||
(collection, o, s)
|
||||
)
|
||||
else:
|
||||
# Optimized: Use subject_table with clustering access (no more ALLOW FILTERING)
|
||||
return self.session.execute(
|
||||
self.get_os_stmt,
|
||||
(collection, s, o, limit)
|
||||
)
|
||||
# Optimized: Use subject_table with clustering access (no more ALLOW FILTERING)
|
||||
return self.session.execute(
|
||||
self.get_os_stmt,
|
||||
(collection, s, o, limit)
|
||||
)
|
||||
|
||||
def get_spo(self, collection, s, p, o, limit=10):
|
||||
if self.use_legacy:
|
||||
return self.session.execute(
|
||||
f"""select s as x from {self.table} where collection = %s and s = %s and p = %s and o = %s limit {limit}""",
|
||||
(collection, s, p, o)
|
||||
# Optimized: Use collection_table for exact key lookup
|
||||
return self.session.execute(
|
||||
self.get_spo_stmt,
|
||||
(collection, s, p, o, limit)
|
||||
)
|
||||
|
||||
def collection_exists(self, collection):
|
||||
"""Check if collection exists by querying collection_metadata table"""
|
||||
try:
|
||||
result = self.session.execute(
|
||||
f"SELECT collection FROM {self.collection_metadata_table} WHERE collection = %s LIMIT 1",
|
||||
(collection,)
|
||||
)
|
||||
else:
|
||||
# Optimized: Use subject_table for exact key lookup
|
||||
return self.session.execute(
|
||||
self.get_spo_stmt,
|
||||
(collection, s, p, o, limit)
|
||||
return bool(list(result))
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking collection existence: {e}")
|
||||
return False
|
||||
|
||||
def create_collection(self, collection):
|
||||
"""Create collection by inserting metadata row"""
|
||||
try:
|
||||
import datetime
|
||||
self.session.execute(
|
||||
f"INSERT INTO {self.collection_metadata_table} (collection, created_at) VALUES (%s, %s)",
|
||||
(collection, datetime.datetime.now())
|
||||
)
|
||||
logger.info(f"Created collection metadata for {collection}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating collection: {e}")
|
||||
raise e
|
||||
|
||||
def delete_collection(self, collection):
|
||||
"""Delete all triples for a specific collection"""
|
||||
if self.use_legacy:
|
||||
self.session.execute(
|
||||
f"delete from {self.table} where collection = %s",
|
||||
(collection,)
|
||||
)
|
||||
else:
|
||||
# Delete from all three tables
|
||||
self.session.execute(
|
||||
f"delete from {self.subject_table} where collection = %s",
|
||||
(collection,)
|
||||
)
|
||||
self.session.execute(
|
||||
f"delete from {self.po_table} where collection = %s",
|
||||
(collection,)
|
||||
)
|
||||
self.session.execute(
|
||||
f"delete from {self.object_table} where collection = %s",
|
||||
(collection,)
|
||||
)
|
||||
"""Delete all triples for a specific collection
|
||||
|
||||
Uses collection_table to enumerate all triples, then deletes from all 4 tables
|
||||
using full partition keys for optimal performance with compound keys.
|
||||
"""
|
||||
# Step 1: Read all triples from collection_table (single partition read)
|
||||
rows = self.session.execute(
|
||||
f"SELECT s, p, o FROM {self.collection_table} WHERE collection = %s",
|
||||
(collection,)
|
||||
)
|
||||
|
||||
# Step 2: Delete each triple from all 4 tables using full partition keys
|
||||
# Batch deletions for efficiency
|
||||
batch = BatchStatement()
|
||||
count = 0
|
||||
|
||||
for row in rows:
|
||||
s, p, o = row.s, row.p, row.o
|
||||
|
||||
# Delete from subject table (partition key: collection, s)
|
||||
batch.add(SimpleStatement(
|
||||
f"DELETE FROM {self.subject_table} WHERE collection = ? AND s = ? AND p = ? AND o = ?"
|
||||
), (collection, s, p, o))
|
||||
|
||||
# Delete from predicate-object table (partition key: collection, p)
|
||||
batch.add(SimpleStatement(
|
||||
f"DELETE FROM {self.po_table} WHERE collection = ? AND p = ? AND o = ? AND s = ?"
|
||||
), (collection, p, o, s))
|
||||
|
||||
# Delete from object table (partition key: collection, o)
|
||||
batch.add(SimpleStatement(
|
||||
f"DELETE FROM {self.object_table} WHERE collection = ? AND o = ? AND s = ? AND p = ?"
|
||||
), (collection, o, s, p))
|
||||
|
||||
# Delete from collection table (partition key: collection only)
|
||||
batch.add(SimpleStatement(
|
||||
f"DELETE FROM {self.collection_table} WHERE collection = ? AND s = ? AND p = ? AND o = ?"
|
||||
), (collection, s, p, o))
|
||||
|
||||
count += 1
|
||||
|
||||
# Execute batch every 100 triples to avoid oversized batches
|
||||
if count % 100 == 0:
|
||||
self.session.execute(batch)
|
||||
batch = BatchStatement()
|
||||
|
||||
# Execute remaining deletions
|
||||
if count % 100 != 0:
|
||||
self.session.execute(batch)
|
||||
|
||||
# Step 3: Delete collection metadata
|
||||
self.session.execute(
|
||||
f"DELETE FROM {self.collection_metadata_table} WHERE collection = %s",
|
||||
(collection,)
|
||||
)
|
||||
|
||||
logger.info(f"Deleted {count} triples from collection {collection}")
|
||||
|
||||
def close(self):
|
||||
"""Close the Cassandra session and cluster connections properly"""
|
||||
|
|
|
|||
|
|
@ -49,6 +49,22 @@ class DocVectors:
|
|||
self.next_reload = time.time() + self.reload_time
|
||||
logger.debug(f"Reload at {self.next_reload}")
|
||||
|
||||
def collection_exists(self, user, collection):
|
||||
"""Check if collection exists (dimension-independent check)"""
|
||||
collection_name = make_safe_collection_name(user, collection, self.prefix)
|
||||
return self.client.has_collection(collection_name)
|
||||
|
||||
def create_collection(self, user, collection, dimension=384):
|
||||
"""Create collection with default dimension"""
|
||||
collection_name = make_safe_collection_name(user, collection, self.prefix)
|
||||
|
||||
if self.client.has_collection(collection_name):
|
||||
logger.info(f"Collection {collection_name} already exists")
|
||||
return
|
||||
|
||||
self.init_collection(dimension, user, collection)
|
||||
logger.info(f"Created Milvus collection {collection_name} with dimension {dimension}")
|
||||
|
||||
def init_collection(self, dimension, user, collection):
|
||||
|
||||
collection_name = make_safe_collection_name(user, collection, self.prefix)
|
||||
|
|
|
|||
|
|
@ -49,6 +49,22 @@ class EntityVectors:
|
|||
self.next_reload = time.time() + self.reload_time
|
||||
logger.debug(f"Reload at {self.next_reload}")
|
||||
|
||||
def collection_exists(self, user, collection):
|
||||
"""Check if collection exists (dimension-independent check)"""
|
||||
collection_name = make_safe_collection_name(user, collection, self.prefix)
|
||||
return self.client.has_collection(collection_name)
|
||||
|
||||
def create_collection(self, user, collection, dimension=384):
|
||||
"""Create collection with default dimension"""
|
||||
collection_name = make_safe_collection_name(user, collection, self.prefix)
|
||||
|
||||
if self.client.has_collection(collection_name):
|
||||
logger.info(f"Collection {collection_name} already exists")
|
||||
return
|
||||
|
||||
self.init_collection(dimension, user, collection)
|
||||
logger.info(f"Created Milvus collection {collection_name} with dimension {dimension}")
|
||||
|
||||
def init_collection(self, dimension, user, collection):
|
||||
|
||||
collection_name = make_safe_collection_name(user, collection, self.prefix)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue