Collection management (#520)

* Tech spec

* Refactored Cassandra knowledge graph for single table

* Collection management, librarian services to manage metadata and collection deletion
This commit is contained in:
cybermaggedon 2025-09-18 15:57:52 +01:00 committed by GitHub
parent 48016d8fb2
commit 13ff7d765d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
48 changed files with 2941 additions and 425 deletions

View file

@ -8,6 +8,7 @@ from . library import Library
from . flow import Flow
from . config import Config
from . knowledge import Knowledge
from . collection import Collection
from . exceptions import *
from . types import *
@ -68,3 +69,6 @@ class Api:
def library(self):
    """Return a ``Library`` object constructed with this ``Api`` instance."""
    return Library(self)
def collection(self):
    """Return a ``Collection`` object constructed with this ``Api`` instance."""
    return Collection(self)

View file

@ -0,0 +1,90 @@
import datetime
import logging
from . types import CollectionMetadata
from . exceptions import *
logger = logging.getLogger(__name__)
class Collection:
    """Helper for the ``collection-management`` service.

    Each operation builds a request dictionary and sends it through the
    ``request`` method of the ``api`` object supplied at construction.
    """

    def __init__(self, api):
        """Keep a reference to the API object used to send requests.

        Args:
            api: Object exposing ``request(path, payload)``.
        """
        self.api = api

    def request(self, request):
        """Send ``request`` to the ``collection-management`` path.

        Returns:
            Whatever ``self.api.request`` returns.
        """
        return self.api.request("collection-management", request)

    @staticmethod
    def _to_metadata(entry):
        """Build a ``CollectionMetadata`` from one entry of a response.

        All seven keys are required; a missing key raises and is turned
        into a ``ProtocolException`` by the calling method.
        """
        return CollectionMetadata(
            user=entry["user"],
            collection=entry["collection"],
            name=entry["name"],
            description=entry["description"],
            tags=entry["tags"],
            created_at=entry["created_at"],
            updated_at=entry["updated_at"],
        )

    def list_collections(self, user, tag_filter=None):
        """List the collections of ``user``.

        Args:
            user: Value sent in the ``user`` field of the request.
            tag_filter: Optional value sent as ``tag_filter``; omitted from
                the request when falsy.

        Returns:
            A list of ``CollectionMetadata``, one per entry of the
            response's ``collections`` field.

        Raises:
            ProtocolException: If the response cannot be parsed.
        """
        payload = {
            "operation": "list-collections",
            "user": user,
        }
        if tag_filter:
            payload["tag_filter"] = tag_filter

        response = self.request(payload)

        try:
            return [
                self._to_metadata(entry)
                for entry in response["collections"]
            ]
        except Exception as e:
            logger.error(
                "Failed to parse collection list response", exc_info=True
            )
            # Chain the cause so the original parsing error is not lost.
            raise ProtocolException("Response not formatted correctly") from e

    def update_collection(self, user, collection, name=None,
                          description=None, tags=None):
        """Send an ``update-collection`` request.

        Only the optional fields that are not ``None`` are included in the
        request, so an empty string or empty list is still sent.

        Returns:
            A ``CollectionMetadata`` built from the first entry of the
            response's ``collections`` field, or ``None`` when that field
            is absent or empty.

        Raises:
            ProtocolException: If the response cannot be parsed.
        """
        payload = {
            "operation": "update-collection",
            "user": user,
            "collection": collection,
        }
        if name is not None:
            payload["name"] = name
        if description is not None:
            payload["description"] = description
        if tags is not None:
            payload["tags"] = tags

        response = self.request(payload)

        try:
            if "collections" in response and response["collections"]:
                return self._to_metadata(response["collections"][0])
            return None
        except Exception as e:
            logger.error(
                "Failed to parse collection update response", exc_info=True
            )
            raise ProtocolException("Response not formatted correctly") from e

    def delete_collection(self, user, collection):
        """Send a ``delete-collection`` request.

        Returns:
            An empty dict; the response body is not inspected.
        """
        self.request({
            "operation": "delete-collection",
            "user": user,
            "collection": collection,
        })
        return {}

View file

@ -41,3 +41,13 @@ class ProcessingMetadata:
user : str
collection : str
tags : List[str]
@dataclasses.dataclass
class CollectionMetadata:
    """Metadata for one collection, as returned by the ``Collection`` API."""

    # Identifying fields
    user: str
    collection: str

    # Descriptive fields
    name: str
    description: str
    tags: List[str]

    # Creation / last-update times, carried as strings
    created_at: str
    updated_at: str

View file

@ -25,6 +25,7 @@ from .translators.objects_query import ObjectsQueryRequestTranslator, ObjectsQue
from .translators.nlp_query import QuestionToStructuredQueryRequestTranslator, QuestionToStructuredQueryResponseTranslator
from .translators.structured_query import StructuredQueryRequestTranslator, StructuredQueryResponseTranslator
from .translators.diagnosis import StructuredDataDiagnosisRequestTranslator, StructuredDataDiagnosisResponseTranslator
from .translators.collection import CollectionManagementRequestTranslator, CollectionManagementResponseTranslator
# Register all service translators
TranslatorRegistry.register_service(
@ -135,6 +136,12 @@ TranslatorRegistry.register_service(
StructuredDataDiagnosisResponseTranslator()
)
# Collection management: registers the request and response translators
# under the "collection-management" service name.
TranslatorRegistry.register_service(
    "collection-management",
    CollectionManagementRequestTranslator(),
    CollectionManagementResponseTranslator()
)

# Register single-direction translators for document loading
TranslatorRegistry.register_request("document", DocumentTranslator())
TranslatorRegistry.register_request("text-document", TextDocumentTranslator())

View file

@ -0,0 +1,112 @@
from typing import Dict, Any, List
from ...schema import CollectionManagementRequest, CollectionManagementResponse, CollectionMetadata, Error
from .base import MessageTranslator
class CollectionManagementRequestTranslator(MessageTranslator):
    """Converts between plain dicts and CollectionManagementRequest records."""

    def to_pulsar(self, data: Dict[str, Any]) -> CollectionManagementRequest:
        """Build a request record from ``data``.

        Absent keys fall back to an empty string, an empty list, or, for
        ``limit``, 50.
        """
        lookup = data.get
        fields = {
            "operation": lookup("operation", ""),
            "user": lookup("user", ""),
            "collection": lookup("collection", ""),
            "timestamp": lookup("timestamp", ""),
            "name": lookup("name", ""),
            "description": lookup("description", ""),
            "tags": lookup("tags", []),
            "created_at": lookup("created_at", ""),
            "updated_at": lookup("updated_at", ""),
            "tag_filter": lookup("tag_filter", []),
            "limit": lookup("limit", 50),
        }
        return CollectionManagementRequest(**fields)

    def from_pulsar(self, obj: CollectionManagementRequest) -> Dict[str, Any]:
        """Convert a request record to a dict, omitting falsy fields.

        Array-valued fields are copied into plain lists.
        """
        # Listed in the order the keys must appear in the output dict.
        field_order = (
            "operation", "user", "collection", "timestamp", "name",
            "description", "tags", "created_at", "updated_at",
            "tag_filter", "limit",
        )
        array_fields = ("tags", "tag_filter")

        result = {}
        for field in field_order:
            value = getattr(obj, field)
            if not value:
                continue
            result[field] = list(value) if field in array_fields else value
        return result
class CollectionManagementResponseTranslator(MessageTranslator):
    """Translator for CollectionManagementResponse schema objects"""

    def to_pulsar(self, data: Dict[str, Any]) -> CollectionManagementResponse:
        """Build a response record from ``data``.

        A missing or falsy ``error`` becomes ``None``. A missing, ``None``
        or empty ``collections`` becomes an empty list. Absent keys inside
        an error or collection entry default to an empty string or list.
        """
        # Handle error
        error = None
        error_data = data.get("error")
        if error_data:
            error = Error(
                type=error_data.get("type", ""),
                message=error_data.get("message", "")
            )

        # Handle collections array. ``or []`` covers an explicit None
        # value, which would otherwise raise TypeError when iterated.
        collections = [
            CollectionMetadata(
                user=coll_data.get("user", ""),
                collection=coll_data.get("collection", ""),
                name=coll_data.get("name", ""),
                description=coll_data.get("description", ""),
                tags=coll_data.get("tags", []),
                created_at=coll_data.get("created_at", ""),
                updated_at=coll_data.get("updated_at", "")
            )
            for coll_data in (data.get("collections") or [])
        ]

        return CollectionManagementResponse(
            success=data.get("success", ""),
            error=error,
            timestamp=data.get("timestamp", ""),
            collections=collections
        )

    def from_pulsar(self, obj: CollectionManagementResponse) -> Dict[str, Any]:
        """Convert a response record to a dict, omitting falsy fields.

        When present, ``collections`` is a list of dicts that always carry
        all seven metadata keys; ``tags`` is copied into a plain list.
        """
        result = {}

        if obj.success:
            result["success"] = obj.success

        if obj.error:
            result["error"] = {
                "type": obj.error.type,
                "message": obj.error.message
            }

        if obj.timestamp:
            result["timestamp"] = obj.timestamp

        if obj.collections:
            result["collections"] = [
                {
                    "user": coll.user,
                    "collection": coll.collection,
                    "name": coll.name,
                    "description": coll.description,
                    "tags": list(coll.tags) if coll.tags else [],
                    "created_at": coll.created_at,
                    "updated_at": coll.updated_at
                }
                for coll in obj.collections
            ]

        return result

View file

@ -10,4 +10,6 @@ from .lookup import *
from .nlp_query import *
from .structured_query import *
from .objects_query import *
from .diagnosis import *
from .diagnosis import *
from .collection import *
from .storage import *

View file

@ -0,0 +1,60 @@
from pulsar.schema import Record, String, Integer, Array
from datetime import datetime
from ..core.primitives import Error
from ..core.topic import topic
############################################################################
# Collection management operations
# Collection metadata operations (for librarian service)
class CollectionMetadata(Record):
    """Collection metadata record.

    Used as the element type of ``CollectionManagementResponse.collections``.
    """
    user = String()
    collection = String()
    name = String()
    description = String()
    tags = Array(String())
    created_at = String()  # ISO timestamp
    updated_at = String()  # ISO timestamp
############################################################################
class CollectionManagementRequest(Record):
    """Request for collection management operations.

    One record type is shared by every operation; ``operation`` selects
    which one is requested.
    """
    operation = String()  # e.g., "delete-collection"

    # User / collection the request refers to
    user = String()
    collection = String()
    timestamp = String()  # ISO timestamp

    # Collection metadata fields
    name = String()
    description = String()
    tags = Array(String())
    created_at = String()  # ISO timestamp
    updated_at = String()  # ISO timestamp

    # For list
    tag_filter = Array(String())  # Optional filter by tags
    # NOTE(review): CollectionManagementRequestTranslator.to_pulsar fills in
    # 50 when the incoming dict has no "limit" key.
    limit = Integer()
class CollectionManagementResponse(Record):
    """Response for collection management operations"""
    success = String()  # "true" or "false" (a string, not a boolean field)
    error = Error()  # Only populated if success is "false"
    timestamp = String()  # ISO timestamp
    collections = Array(CollectionMetadata())  # Zero or more metadata records
############################################################################
# Topics: one request queue and one response queue, both non-persistent.
collection_request_queue = topic(
    "collection", namespace="request", kind="non-persistent",
)

collection_response_queue = topic(
    "collection", namespace="response", kind="non-persistent",
)

View file

@ -0,0 +1,42 @@
from pulsar.schema import Record, String
from ..core.primitives import Error
from ..core.topic import topic
############################################################################
# Storage management operations
class StorageManagementRequest(Record):
    """Request for storage management operations sent to store processors"""
    operation = String()  # e.g., "delete-collection"
    user = String()
    collection = String()
class StorageManagementResponse(Record):
    """Response from storage processors for management operations"""
    error = Error()  # Only populated on failure; a null error means success
############################################################################
# Storage management topics

# Request topics, one per storage type, used to send collection management
# requests.
vector_storage_management_topic = topic(
    "vector-storage-management", namespace="request", kind="non-persistent",
)

object_storage_management_topic = topic(
    "object-storage-management", namespace="request", kind="non-persistent",
)

triples_storage_management_topic = topic(
    "triples-storage-management", namespace="request", kind="non-persistent",
)

# Topic for receiving responses from storage processors.
storage_management_response_topic = topic(
    "storage-management", namespace="response", kind="non-persistent",
)
############################################################################