trustgraph/trustgraph-flow/trustgraph/storage/triples/cassandra/write.py

310 lines
9.8 KiB
Python
Raw Normal View History

2024-07-10 23:20:06 +01:00
"""
2024-07-12 15:12:40 +01:00
Graph writer. Input is graph edge. Writes edges to Cassandra graph.
2024-07-10 23:20:06 +01:00
"""
import asyncio
2024-07-10 23:20:06 +01:00
import base64
import os
import argparse
import time
import logging
import json
2024-07-10 23:20:06 +01:00
from .... direct.cassandra_kg import (
EntityCentricKnowledgeGraph, DEFAULT_GRAPH
)
from .... base import TriplesStoreService, CollectionConfigHandler
from .... base import AsyncProcessor, Consumer, Producer
from .... base import ConsumerMetrics, ProducerMetrics
from .... base.cassandra_config import add_cassandra_args, resolve_cassandra_config
from .... schema import IRI, LITERAL, BLANK, TRIPLE
2024-07-10 23:20:06 +01:00
# Module logger
logger = logging.getLogger(__name__)
default_ident = "triples-write"
2024-07-15 17:17:04 +01:00
def serialize_triple(triple):
    """
    Serialize a Triple object to a JSON string for storage.

    Each of the subject/predicate/object terms becomes a dict keyed by
    its type code; nested (reified) triples are serialized recursively
    as JSON strings.  Returns None for a None input.
    """
    if triple is None:
        return None

    def encode(term):
        # Build a per-type dict for one term; None stays None.
        if term is None:
            return None
        d = {"type": term.type}
        if term.type == IRI:
            d["iri"] = term.iri
        elif term.type == LITERAL:
            d["value"] = term.value
            if term.datatype:
                d["datatype"] = term.datatype
            if term.language:
                d["language"] = term.language
        elif term.type == BLANK:
            d["id"] = term.id
        elif term.type == TRIPLE:
            # Recursive case: the nested triple is embedded as a JSON string
            d["triple"] = serialize_triple(term.triple)
        return d

    return json.dumps(
        {"s": encode(triple.s), "p": encode(triple.p), "o": encode(triple.o)}
    )
def get_term_value(term):
    """Extract the string value carried by a Term (None for None input)."""
    if term is None:
        return None
    ttype = term.type
    if ttype == IRI:
        return term.iri
    if ttype == LITERAL:
        return term.value
    if ttype == TRIPLE:
        # Nested triples are stored as their JSON serialization
        return serialize_triple(term.triple)
    # Blank nodes and any other types: prefer the id, fall back to value
    return term.id or term.value
def get_term_otype(term):
    """
    Map a Term's type to the single-character otype code used by the
    entity-centric store:

      IRI ("i")     -> "u"  (URI)
      BLANK ("b")   -> "u"  (treated as URI)
      LITERAL ("l") -> "l"  (literal)
      TRIPLE ("t")  -> "t"  (triple / reification)

    None and any unrecognized type also map to "u".
    """
    if term is None:
        return "u"
    if term.type == LITERAL:
        return "l"
    if term.type == TRIPLE:
        return "t"
    # IRI, BLANK, and unknown types are all stored as URIs
    return "u"
def get_term_dtype(term):
    """Return a literal Term's datatype, or "" when absent or not a literal."""
    if term is not None and term.type == LITERAL:
        return term.datatype or ""
    return ""
def get_term_lang(term):
    """Return a literal Term's language tag, or "" when absent or not a literal."""
    if term is not None and term.type == LITERAL:
        return term.language or ""
    return ""
class Processor(CollectionConfigHandler, TriplesStoreService):
    """
    Triples store service writing graph edges to Cassandra using the
    entity-centric knowledge-graph schema.

    One keyspace per user.  A single connection (self.tg) is held and
    re-created whenever the requested user differs from the one it was
    opened for (tracked in self.table).  All cassandra-driver work is
    synchronous, so each operation runs in a worker thread via
    asyncio.to_thread to keep the event loop responsive.
    """

    def __init__(self, **params):

        # Explicit parameters win; resolve_cassandra_config falls back
        # to environment variables for anything unspecified.
        cassandra_host = params.get("cassandra_host")
        cassandra_username = params.get("cassandra_username")
        cassandra_password = params.get("cassandra_password")

        hosts, username, password, keyspace = resolve_cassandra_config(
            host=cassandra_host,
            username=cassandra_username,
            password=cassandra_password
        )

        # Pass the resolved (not raw) host/username up so the base
        # class sees the effective configuration; password is kept
        # out of params.
        super(Processor, self).__init__(
            **params | {
                "cassandra_host": ','.join(hosts),
                "cassandra_username": username
            }
        )

        self.cassandra_host = hosts
        self.cassandra_username = username
        self.cassandra_password = password

        # self.table records which user's keyspace self.tg is connected
        # to; None means no connection yet.
        self.table = None
        self.tg = None

        # Register for config push notifications; only fire when
        # "collection" config types change.
        self.register_config_handler(
            self.on_collection_config, types=["collection"]
        )

    def _ensure_connection(self, user):
        """
        Ensure self.tg is a knowledge-graph connection to *user*'s
        keyspace, reconnecting if the user changed since the last call.

        Synchronous (cassandra-driver) — call only from a worker thread.
        Raises on connection failure after a short backoff so a caller's
        retry loop doesn't spin hard.
        """
        if self.table is not None and self.table == user:
            return

        self.tg = None
        KGClass = EntityCentricKnowledgeGraph

        kwargs = {
            "hosts": self.cassandra_host,
            "keyspace": user,
        }
        # Credentials are optional; only pass them when both are set
        if self.cassandra_username and self.cassandra_password:
            kwargs["username"] = self.cassandra_username
            kwargs["password"] = self.cassandra_password

        try:
            self.tg = KGClass(**kwargs)
        except Exception as e:
            logger.error(
                f"Failed to connect to Cassandra for user {user}: {e}",
                exc_info=True,
            )
            # Brief backoff before propagating the failure
            time.sleep(1)
            raise

        self.table = user

    async def store_triples(self, message):
        """
        Write a batch of triples from *message* into the user's keyspace.

        Connection setup, schema work, and per-triple inserts are all
        synchronous, so the whole batch is wrapped in a worker thread.
        """

        user = message.metadata.user

        def _do_store():

            self._ensure_connection(user)

            for t in message.triples:

                # Extract string values from the Term objects
                s_val = get_term_value(t.s)
                p_val = get_term_value(t.p)
                o_val = get_term_value(t.o)

                # t.g is None for the default graph, or a graph IRI
                g_val = t.g if t.g is not None else DEFAULT_GRAPH

                # Object-type metadata for entity-centric storage
                otype = get_term_otype(t.o)
                dtype = get_term_dtype(t.o)
                lang = get_term_lang(t.o)

                self.tg.insert(
                    message.metadata.collection,
                    s_val,
                    p_val,
                    o_val,
                    g=g_val,
                    otype=otype,
                    dtype=dtype,
                    lang=lang,
                )

        await asyncio.to_thread(_do_store)

    async def create_collection(self, user: str, collection: str, metadata: dict):
        """Create a collection in the Cassandra triple store (config push)."""

        def _do_create():

            self._ensure_connection(user)

            logger.info(f"Creating collection {collection} for user {user}")

            if self.tg.collection_exists(collection):
                logger.info(f"Collection {collection} already exists")
            else:
                self.tg.create_collection(collection)
                logger.info(f"Created collection {collection}")

        try:
            await asyncio.to_thread(_do_create)
        except Exception as e:
            logger.error(
                f"Failed to create collection {user}/{collection}: {e}",
                exc_info=True,
            )
            raise

    async def delete_collection(self, user: str, collection: str):
        """Delete all data for a collection from the unified triples table."""

        def _do_delete():

            self._ensure_connection(user)

            self.tg.delete_collection(collection)
            logger.info(
                f"Deleted all triples for collection {collection} "
                f"from keyspace {user}"
            )

        try:
            await asyncio.to_thread(_do_delete)
        except Exception as e:
            logger.error(
                f"Failed to delete collection {user}/{collection}: {e}",
                exc_info=True,
            )
            raise

    @staticmethod
    def add_args(parser):
        """Add service and Cassandra command-line arguments to *parser*."""
        TriplesStoreService.add_args(parser)
        add_cassandra_args(parser)
2024-07-10 23:20:06 +01:00
def run():
    """Entry point: launch the triples-write processor."""
    Processor.launch(default_ident, __doc__)
2024-07-10 23:20:06 +01:00