Collection management (#520)

* Tech spec

* Refactored Cassandra knowledge graph for single table

* Collection management, librarian services to manage metadata and collection deletion
This commit is contained in:
cybermaggedon 2025-09-18 15:57:52 +01:00 committed by GitHub
parent 48016d8fb2
commit 13ff7d765d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
48 changed files with 2941 additions and 425 deletions

View file

@ -6,18 +6,18 @@ from ssl import SSLContext, PROTOCOL_TLSv1_2
# Global list to track clusters for cleanup
_active_clusters = []
class TrustGraph:
class KnowledgeGraph:
def __init__(
self, hosts=None,
keyspace="trustgraph", table="default", username=None, password=None
keyspace="trustgraph", username=None, password=None
):
if hosts is None:
hosts = ["localhost"]
self.keyspace = keyspace
self.table = table
self.table = "triples" # Fixed table name for unified schema
self.username = username
if username and password:
@ -55,13 +55,19 @@ class TrustGraph:
self.session.execute(f"""
create table if not exists {self.table} (
collection text,
s text,
p text,
o text,
PRIMARY KEY (s, p, o)
PRIMARY KEY (collection, s, p, o)
);
""");
self.session.execute(f"""
create index if not exists {self.table}_s
ON {self.table} (s);
""");
self.session.execute(f"""
create index if not exists {self.table}_p
ON {self.table} (p);
@ -72,58 +78,66 @@ class TrustGraph:
ON {self.table} (o);
""");
def insert(self, s, p, o):
def insert(self, collection, s, p, o):
self.session.execute(
f"insert into {self.table} (s, p, o) values (%s, %s, %s)",
(s, p, o)
f"insert into {self.table} (collection, s, p, o) values (%s, %s, %s, %s)",
(collection, s, p, o)
)
def get_all(self, limit=50):
def get_all(self, collection, limit=50):
return self.session.execute(
f"select s, p, o from {self.table} limit {limit}"
f"select s, p, o from {self.table} where collection = %s limit {limit}",
(collection,)
)
def get_s(self, s, limit=10):
def get_s(self, collection, s, limit=10):
return self.session.execute(
f"select p, o from {self.table} where s = %s limit {limit}",
(s,)
f"select p, o from {self.table} where collection = %s and s = %s limit {limit}",
(collection, s)
)
def get_p(self, p, limit=10):
def get_p(self, collection, p, limit=10):
return self.session.execute(
f"select s, o from {self.table} where p = %s limit {limit}",
(p,)
f"select s, o from {self.table} where collection = %s and p = %s limit {limit}",
(collection, p)
)
def get_o(self, o, limit=10):
def get_o(self, collection, o, limit=10):
return self.session.execute(
f"select s, p from {self.table} where o = %s limit {limit}",
(o,)
f"select s, p from {self.table} where collection = %s and o = %s limit {limit}",
(collection, o)
)
def get_sp(self, s, p, limit=10):
def get_sp(self, collection, s, p, limit=10):
return self.session.execute(
f"select o from {self.table} where s = %s and p = %s limit {limit}",
(s, p)
f"select o from {self.table} where collection = %s and s = %s and p = %s limit {limit}",
(collection, s, p)
)
def get_po(self, p, o, limit=10):
def get_po(self, collection, p, o, limit=10):
return self.session.execute(
f"select s from {self.table} where p = %s and o = %s limit {limit} allow filtering",
(p, o)
f"select s from {self.table} where collection = %s and p = %s and o = %s limit {limit} allow filtering",
(collection, p, o)
)
def get_os(self, o, s, limit=10):
def get_os(self, collection, o, s, limit=10):
return self.session.execute(
f"select p from {self.table} where o = %s and s = %s limit {limit}",
(o, s)
f"select p from {self.table} where collection = %s and o = %s and s = %s limit {limit} allow filtering",
(collection, o, s)
)
def get_spo(self, s, p, o, limit=10):
def get_spo(self, collection, s, p, o, limit=10):
return self.session.execute(
f"""select s as x from {self.table} where s = %s and p = %s and o = %s limit {limit}""",
(s, p, o)
f"""select s as x from {self.table} where collection = %s and s = %s and p = %s and o = %s limit {limit}""",
(collection, s, p, o)
)
def delete_collection(self, collection):
"""Delete all triples for a specific collection.

Issues one CQL DELETE on ``self.table`` restricted only by
``collection``, which is the first column of the table's primary key
(collection, s, p, o), so every row stored under that collection is
targeted.  Nothing is returned.

NOTE(review): deleting a collection that has no rows is presumably a
no-op in Cassandra -- confirm.
"""
self.session.execute(
f"delete from {self.table} where collection = %s",
# The collection value is passed as a bound parameter, not formatted
# into the CQL text.
(collection,)
)
def close(self):

View file

@ -6,7 +6,7 @@ import re
logger = logging.getLogger(__name__)
def make_safe_collection_name(user, collection, dimension, prefix):
def make_safe_collection_name(user, collection, prefix):
"""
Create a safe Milvus collection name from user/collection parameters.
Milvus only allows letters, numbers, and underscores.
@ -26,7 +26,7 @@ def make_safe_collection_name(user, collection, dimension, prefix):
safe_user = sanitize(user)
safe_collection = sanitize(collection)
return f"{prefix}_{safe_user}_{safe_collection}_{dimension}"
return f"{prefix}_{safe_user}_{safe_collection}"
class DocVectors:
@ -51,7 +51,7 @@ class DocVectors:
def init_collection(self, dimension, user, collection):
collection_name = make_safe_collection_name(user, collection, dimension, self.prefix)
collection_name = make_safe_collection_name(user, collection, self.prefix)
pkey_field = FieldSchema(
name="id",
@ -162,3 +162,20 @@ class DocVectors:
return res
def delete_collection(self, user, collection):
    """Drop the Milvus collection for a user/collection pair.

    The Milvus collection name is derived with
    ``make_safe_collection_name`` from ``user``, ``collection`` and
    ``self.prefix``.  If Milvus has a collection of that name it is
    dropped; otherwise nothing is dropped and the fact is logged.

    Entries in ``self.collections`` whose key carries this user (index 1)
    and collection (index 2) are removed in both cases, so a cached entry
    cannot outlive a collection that no longer exists in Milvus.

    Args:
        user: User the collection belongs to.
        collection: Collection identifier for that user.
    """
    collection_name = make_safe_collection_name(user, collection, self.prefix)

    if self.client.has_collection(collection_name):
        self.client.drop_collection(collection_name)
        logger.info(f"Deleted Milvus collection: {collection_name}")
    else:
        logger.info(f"Collection {collection_name} does not exist, nothing to delete")

    # Evict unconditionally: if the collection was already gone from Milvus,
    # a leftover cache entry would otherwise make later code treat it as
    # still initialised.  Build the key list first so the dict is not
    # mutated while it is being iterated.
    stale_keys = [
        key for key in self.collections
        if key[1] == user and key[2] == collection
    ]
    for key in stale_keys:
        del self.collections[key]

View file

@ -6,7 +6,7 @@ import re
logger = logging.getLogger(__name__)
def make_safe_collection_name(user, collection, dimension, prefix):
def make_safe_collection_name(user, collection, prefix):
"""
Create a safe Milvus collection name from user/collection parameters.
Milvus only allows letters, numbers, and underscores.
@ -26,7 +26,7 @@ def make_safe_collection_name(user, collection, dimension, prefix):
safe_user = sanitize(user)
safe_collection = sanitize(collection)
return f"{prefix}_{safe_user}_{safe_collection}_{dimension}"
return f"{prefix}_{safe_user}_{safe_collection}"
class EntityVectors:
@ -51,7 +51,7 @@ class EntityVectors:
def init_collection(self, dimension, user, collection):
collection_name = make_safe_collection_name(user, collection, dimension, self.prefix)
collection_name = make_safe_collection_name(user, collection, self.prefix)
pkey_field = FieldSchema(
name="id",
@ -162,3 +162,20 @@ class EntityVectors:
return res
def delete_collection(self, user, collection):
    """Drop the Milvus collection for a user/collection pair.

    The Milvus collection name is derived with
    ``make_safe_collection_name`` from ``user``, ``collection`` and
    ``self.prefix``.  If Milvus has a collection of that name it is
    dropped; otherwise nothing is dropped and the fact is logged.

    Entries in ``self.collections`` whose key carries this user (index 1)
    and collection (index 2) are removed in both cases, so a cached entry
    cannot outlive a collection that no longer exists in Milvus.

    Args:
        user: User the collection belongs to.
        collection: Collection identifier for that user.
    """
    collection_name = make_safe_collection_name(user, collection, self.prefix)

    if self.client.has_collection(collection_name):
        self.client.drop_collection(collection_name)
        logger.info(f"Deleted Milvus collection: {collection_name}")
    else:
        logger.info(f"Collection {collection_name} does not exist, nothing to delete")

    # Evict unconditionally: if the collection was already gone from Milvus,
    # a leftover cache entry would otherwise make later code treat it as
    # still initialised.  Build the key list first so the dict is not
    # mutated while it is being iterated.
    stale_keys = [
        key for key in self.collections
        if key[1] == user and key[2] == collection
    ]
    for key in stale_keys:
        del self.collections[key]