2024-07-10 23:20:06 +01:00
|
|
|
|
|
|
|
|
"""
|
2024-07-12 15:12:40 +01:00
|
|
|
Graph writer. Input is graph edge. Writes edges to Cassandra graph.
|
2024-07-10 23:20:06 +01:00
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import pulsar
|
|
|
|
|
import base64
|
|
|
|
|
import os
|
|
|
|
|
import argparse
|
|
|
|
|
import time
|
2025-07-30 23:18:38 +01:00
|
|
|
import logging
|
2024-07-10 23:20:06 +01:00
|
|
|
|
2025-09-18 15:57:52 +01:00
|
|
|
from .... direct.cassandra_kg import KnowledgeGraph
|
2025-04-22 20:21:38 +01:00
|
|
|
from .... base import TriplesStoreService
|
2025-09-18 15:57:52 +01:00
|
|
|
from .... base import AsyncProcessor, Consumer, Producer
|
|
|
|
|
from .... base import ConsumerMetrics, ProducerMetrics
|
2025-09-03 23:41:22 +01:00
|
|
|
from .... base.cassandra_config import add_cassandra_args, resolve_cassandra_config
|
2025-09-18 15:57:52 +01:00
|
|
|
from .... schema import StorageManagementRequest, StorageManagementResponse, Error
|
|
|
|
|
from .... schema import triples_storage_management_topic, storage_management_response_topic
|
2024-07-10 23:20:06 +01:00
|
|
|
|
2025-07-30 23:18:38 +01:00
|
|
|
# Module logger
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
2025-04-22 20:21:38 +01:00
|
|
|
# Default processor identity: the fallback for the "id" parameter and the
# name passed to Processor.launch().
default_ident = "triples-write"
|
2024-07-23 21:34:03 +01:00
|
|
|
|
2024-07-15 17:17:04 +01:00
|
|
|
|
2025-04-22 20:21:38 +01:00
|
|
|
class Processor(TriplesStoreService):
    """Triples store service that writes triples to Cassandra.

    The user named in each message is used as the Cassandra keyspace.  A
    single ``KnowledgeGraph`` connection is cached at a time and replaced
    when a message for a different user arrives.  The processor also
    consumes storage management requests (only ``delete-collection`` is
    handled) and answers on the storage management response topic.
    """

    def __init__(self, **params):
        """Resolve Cassandra settings and set up storage management I/O.

        Args:
            **params: Service parameters.  ``id``, ``cassandra_host``,
                ``cassandra_username`` and ``cassandra_password`` are read
                here; all parameters are forwarded to the base class.
        """

        # Named ``ident`` rather than ``id`` so the builtin is not shadowed.
        ident = params.get("id", default_ident)

        # Resolve configuration with environment variable fallback
        hosts, username, password = resolve_cassandra_config(
            host=params.get("cassandra_host"),
            username=params.get("cassandra_username"),
            password=params.get("cassandra_password"),
        )

        # Only the resolved hosts and username are forwarded to the base
        # class; the password is kept on this instance only.
        super(Processor, self).__init__(
            **params | {
                "cassandra_host": ','.join(hosts),
                "cassandra_username": username
            }
        )

        self.cassandra_host = hosts
        self.cassandra_username = username
        self.cassandra_password = password

        # Cached connection (``tg``) and the user/keyspace it is bound to
        # (``table``).  Both are None until the first successful connect.
        self.table = None
        self.tg = None

        # Set up metrics for storage management
        storage_request_metrics = ConsumerMetrics(
            processor=self.id, flow=None, name="storage-request"
        )
        storage_response_metrics = ProducerMetrics(
            processor=self.id, flow=None, name="storage-response"
        )

        # Set up consumer for storage management requests
        self.storage_request_consumer = Consumer(
            taskgroup=self.taskgroup,
            client=self.pulsar_client,
            flow=None,
            topic=triples_storage_management_topic,
            subscriber=f"{ident}-storage",
            schema=StorageManagementRequest,
            handler=self.on_storage_management,
            metrics=storage_request_metrics,
        )

        # Set up producer for storage management responses
        self.storage_response_producer = Producer(
            client=self.pulsar_client,
            topic=storage_management_response_topic,
            schema=StorageManagementResponse,
            metrics=storage_response_metrics,
        )

    def _ensure_connection(self, user):
        """Make ``self.tg`` a ``KnowledgeGraph`` bound to *user*'s keyspace.

        The cached connection is reused when it already targets *user*;
        otherwise a new one is opened.  Credentials are supplied only when
        both a username and a password are configured.

        Args:
            user: Keyspace name to connect to.

        Raises:
            Exception: Whatever ``KnowledgeGraph`` raises while connecting.
                The cache is cleared before the attempt, so a later call
                reconnects instead of using a missing connection.
        """
        if (
            self.tg is not None
            and self.table is not None
            and self.table == user
        ):
            return

        # Clear both halves of the cache together.  If only ``tg`` were
        # reset, a failed connect would leave ``table`` naming the previous
        # user, and the next message for that user would skip reconnecting
        # and dereference ``None``.
        self.tg = None
        self.table = None

        connect_args = {"hosts": self.cassandra_host, "keyspace": user}
        if self.cassandra_username and self.cassandra_password:
            connect_args["username"] = self.cassandra_username
            connect_args["password"] = self.cassandra_password

        self.tg = KnowledgeGraph(**connect_args)
        self.table = user

    async def store_triples(self, message):
        """Insert every triple of *message* into the user's keyspace.

        Args:
            message: Object exposing ``metadata.user``,
                ``metadata.collection`` and an iterable ``triples`` whose
                items carry ``s.value``, ``p.value`` and ``o.value``.

        Raises:
            Exception: Connection failures are logged and re-raised after
                a one second pause; insert errors propagate unchanged.
        """
        # Function-scope import: keeps this block self-contained without
        # touching the file-level import list.
        import asyncio

        try:
            self._ensure_connection(message.metadata.user)
        except Exception as e:
            logger.error(f"Exception: {e}", exc_info=True)
            # Pause before propagating the failure.  asyncio.sleep is used
            # instead of time.sleep so the event loop (and the storage
            # management consumer running on it) is not blocked.
            await asyncio.sleep(1)
            raise

        for t in message.triples:
            self.tg.insert(
                message.metadata.collection,
                t.s.value,
                t.p.value,
                t.o.value
            )

    async def on_storage_management(self, message):
        """Handle storage management requests.

        Dispatches ``delete-collection`` to ``handle_delete_collection``.
        Any other operation gets an ``invalid_operation`` error response,
        and any exception raised while handling is reported back as a
        ``processing_error`` response.

        Args:
            message: Request exposing ``operation``, ``user`` and
                ``collection``.
        """
        logger.info(f"Storage management request: {message.operation} for {message.user}/{message.collection}")

        try:
            if message.operation == "delete-collection":
                await self.handle_delete_collection(message)
            else:
                response = StorageManagementResponse(
                    error=Error(
                        type="invalid_operation",
                        message=f"Unknown operation: {message.operation}"
                    )
                )
                await self.storage_response_producer.send(response)

        except Exception as e:
            logger.error(f"Error processing storage management request: {e}", exc_info=True)
            response = StorageManagementResponse(
                error=Error(
                    type="processing_error",
                    message=str(e)
                )
            )
            await self.storage_response_producer.send(response)

    async def handle_delete_collection(self, message):
        """Delete all data for a specific collection from the unified triples table.

        Connects to (or reuses the connection for) ``message.user``'s
        keyspace, deletes the rows of ``message.collection`` and sends a
        response with ``error=None`` on success.

        Args:
            message: Request exposing ``user`` and ``collection``.

        Raises:
            Exception: Connection, delete and send failures are logged and
                re-raised for the caller to report.
        """
        try:
            # Create or reuse connection for this user's keyspace
            try:
                self._ensure_connection(message.user)
            except Exception as e:
                logger.error(f"Failed to connect to Cassandra for user {message.user}: {e}")
                raise

            # Delete all triples for this collection from the unified table
            # In the unified table schema, collection is the partition key
            #
            # The statement is executed as a simple (non-prepared) query,
            # for which the Cassandra Python driver expects "%s"
            # placeholders; "?" is only valid in prepared statements.
            delete_cql = """
                DELETE FROM triples
                WHERE collection = %s
            """

            try:
                self.tg.session.execute(delete_cql, (message.collection,))
                logger.info(f"Deleted all triples for collection {message.collection} from keyspace {message.user}")
            except Exception as e:
                logger.error(f"Failed to delete collection data: {e}")
                raise

            # Send success response
            response = StorageManagementResponse(
                error=None  # No error means success
            )
            await self.storage_response_producer.send(response)
            logger.info(f"Successfully deleted collection {message.user}/{message.collection}")

        except Exception as e:
            logger.error(f"Failed to delete collection: {e}")
            raise

    @staticmethod
    def add_args(parser):
        """Register the base service arguments and the Cassandra arguments.

        Args:
            parser: Argument parser passed on to
                ``TriplesStoreService.add_args`` and ``add_cassandra_args``.
        """
        TriplesStoreService.add_args(parser)
        add_cassandra_args(parser)
|
2024-07-10 23:20:06 +01:00
|
|
|
|
|
|
|
|
def run():
    """Launch the Processor under the default identity.

    The module docstring is handed to ``Processor.launch`` as its second
    argument.
    """
    # ``__doc__`` resolves to the module-level docstring here, not to this
    # function's own docstring.
    module_doc = __doc__
    Processor.launch(default_ident, module_doc)
|
2024-07-10 23:20:06 +01:00
|
|
|
|