trustgraph/trustgraph-flow/trustgraph/storage/triples/cassandra/write.py
cybermaggedon 2bf4af294e
Better proc group logging and concurrency (#810)
- Silence pika, cassandra etc. logging at INFO (too much chatter) 
- Add per processor log tags so that logs can be understood in
  processor group.
- Deal with RabbitMQ lag weirdness
- Added more processor group examples
2026-04-15 14:52:01 +01:00

309 lines
9.8 KiB
Python
Executable file

"""
Graph writer. Input is graph edge. Writes edges to Cassandra graph.
"""
import asyncio
import base64
import os
import argparse
import time
import logging
import json
from .... direct.cassandra_kg import (
EntityCentricKnowledgeGraph, DEFAULT_GRAPH
)
from .... base import TriplesStoreService, CollectionConfigHandler
from .... base import AsyncProcessor, Consumer, Producer
from .... base import ConsumerMetrics, ProducerMetrics
from .... base.cassandra_config import add_cassandra_args, resolve_cassandra_config
from .... schema import IRI, LITERAL, BLANK, TRIPLE
# Module logger
logger = logging.getLogger(__name__)
default_ident = "triples-write"
def serialize_triple(triple):
"""Serialize a Triple object to JSON for storage."""
if triple is None:
return None
def term_to_dict(term):
if term is None:
return None
result = {"type": term.type}
if term.type == IRI:
result["iri"] = term.iri
elif term.type == LITERAL:
result["value"] = term.value
if term.datatype:
result["datatype"] = term.datatype
if term.language:
result["language"] = term.language
elif term.type == BLANK:
result["id"] = term.id
elif term.type == TRIPLE:
result["triple"] = serialize_triple(term.triple)
return result
return json.dumps({
"s": term_to_dict(triple.s),
"p": term_to_dict(triple.p),
"o": term_to_dict(triple.o),
})
def get_term_value(term):
"""Extract the string value from a Term"""
if term is None:
return None
if term.type == IRI:
return term.iri
elif term.type == LITERAL:
return term.value
elif term.type == TRIPLE:
# Serialize nested triple as JSON
return serialize_triple(term.triple)
else:
# For blank nodes or other types, use id or value
return term.id or term.value
def get_term_otype(term):
"""
Get object type code from a Term for entity-centric storage.
Maps Term.type to otype:
- IRI ("i") → "u" (URI)
- BLANK ("b") → "u" (treated as URI)
- LITERAL ("l") → "l" (Literal)
- TRIPLE ("t") → "t" (Triple/reification)
"""
if term is None:
return "u"
if term.type == IRI or term.type == BLANK:
return "u"
elif term.type == LITERAL:
return "l"
elif term.type == TRIPLE:
return "t"
else:
return "u"
def get_term_dtype(term):
"""Extract datatype from a Term (for literals)"""
if term is None:
return ""
if term.type == LITERAL:
return term.datatype or ""
return ""
def get_term_lang(term):
"""Extract language tag from a Term (for literals)"""
if term is None:
return ""
if term.type == LITERAL:
return term.language or ""
return ""
class Processor(CollectionConfigHandler, TriplesStoreService):
def __init__(self, **params):
id = params.get("id", default_ident)
# Get Cassandra parameters
cassandra_host = params.get("cassandra_host")
cassandra_username = params.get("cassandra_username")
cassandra_password = params.get("cassandra_password")
# Resolve configuration with environment variable fallback
hosts, username, password, keyspace = resolve_cassandra_config(
host=cassandra_host,
username=cassandra_username,
password=cassandra_password
)
super(Processor, self).__init__(
**params | {
"cassandra_host": ','.join(hosts),
"cassandra_username": username
}
)
self.cassandra_host = hosts
self.cassandra_username = username
self.cassandra_password = password
self.table = None
self.tg = None
# Register for config push notifications
self.register_config_handler(self.on_collection_config, types=["collection"])
async def store_triples(self, message):
user = message.metadata.user
# The cassandra-driver work below — connection, schema
# setup, and per-triple inserts — is all synchronous.
# Wrap the whole batch in a worker thread so the event
# loop stays responsive for sibling processors when
# running in a processor group.
def _do_store():
if self.table is None or self.table != user:
self.tg = None
# Use factory function to select implementation
KGClass = EntityCentricKnowledgeGraph
try:
if self.cassandra_username and self.cassandra_password:
self.tg = KGClass(
hosts=self.cassandra_host,
keyspace=message.metadata.user,
username=self.cassandra_username,
password=self.cassandra_password,
)
else:
self.tg = KGClass(
hosts=self.cassandra_host,
keyspace=message.metadata.user,
)
except Exception as e:
logger.error(f"Exception: {e}", exc_info=True)
time.sleep(1)
raise e
self.table = user
for t in message.triples:
# Extract values from Term objects
s_val = get_term_value(t.s)
p_val = get_term_value(t.p)
o_val = get_term_value(t.o)
# t.g is None for default graph, or a graph IRI
g_val = t.g if t.g is not None else DEFAULT_GRAPH
# Extract object type metadata for entity-centric storage
otype = get_term_otype(t.o)
dtype = get_term_dtype(t.o)
lang = get_term_lang(t.o)
self.tg.insert(
message.metadata.collection,
s_val,
p_val,
o_val,
g=g_val,
otype=otype,
dtype=dtype,
lang=lang,
)
await asyncio.to_thread(_do_store)
async def create_collection(self, user: str, collection: str, metadata: dict):
"""Create a collection in Cassandra triple store via config push"""
def _do_create():
# Create or reuse connection for this user's keyspace
if self.table is None or self.table != user:
self.tg = None
# Use factory function to select implementation
KGClass = EntityCentricKnowledgeGraph
try:
if self.cassandra_username and self.cassandra_password:
self.tg = KGClass(
hosts=self.cassandra_host,
keyspace=user,
username=self.cassandra_username,
password=self.cassandra_password,
)
else:
self.tg = KGClass(
hosts=self.cassandra_host,
keyspace=user,
)
except Exception as e:
logger.error(f"Failed to connect to Cassandra for user {user}: {e}")
raise
self.table = user
# Create collection using the built-in method
logger.info(f"Creating collection {collection} for user {user}")
if self.tg.collection_exists(collection):
logger.info(f"Collection {collection} already exists")
else:
self.tg.create_collection(collection)
logger.info(f"Created collection {collection}")
try:
await asyncio.to_thread(_do_create)
except Exception as e:
logger.error(f"Failed to create collection {user}/{collection}: {e}", exc_info=True)
raise
async def delete_collection(self, user: str, collection: str):
"""Delete all data for a specific collection from the unified triples table"""
def _do_delete():
# Create or reuse connection for this user's keyspace
if self.table is None or self.table != user:
self.tg = None
# Use factory function to select implementation
KGClass = EntityCentricKnowledgeGraph
try:
if self.cassandra_username and self.cassandra_password:
self.tg = KGClass(
hosts=self.cassandra_host,
keyspace=user,
username=self.cassandra_username,
password=self.cassandra_password,
)
else:
self.tg = KGClass(
hosts=self.cassandra_host,
keyspace=user,
)
except Exception as e:
logger.error(f"Failed to connect to Cassandra for user {user}: {e}")
raise
self.table = user
# Delete all triples for this collection using the built-in method
self.tg.delete_collection(collection)
logger.info(f"Deleted all triples for collection {collection} from keyspace {user}")
try:
await asyncio.to_thread(_do_delete)
except Exception as e:
logger.error(f"Failed to delete collection {user}/{collection}: {e}", exc_info=True)
raise
@staticmethod
def add_args(parser):
TriplesStoreService.add_args(parser)
add_cassandra_args(parser)
def run():
Processor.launch(default_ident, __doc__)