2024-07-10 23:20:06 +01:00
|
|
|
|
|
|
|
|
"""
|
2024-07-12 15:12:40 +01:00
|
|
|
Graph writer. Consumes graph edges and writes them to a Cassandra graph store.
|
2024-07-10 23:20:06 +01:00
|
|
|
"""
|
|
|
|
|
|
2026-04-15 14:52:01 +01:00
|
|
|
import asyncio
|
2024-07-10 23:20:06 +01:00
|
|
|
import base64
|
|
|
|
|
import os
|
|
|
|
|
import argparse
|
|
|
|
|
import time
|
2025-07-30 23:18:38 +01:00
|
|
|
import logging
|
2026-03-06 12:23:58 +00:00
|
|
|
import json
|
2024-07-10 23:20:06 +01:00
|
|
|
|
2026-02-16 13:26:43 +00:00
|
|
|
from .... direct.cassandra_kg import (
|
|
|
|
|
EntityCentricKnowledgeGraph, DEFAULT_GRAPH
|
|
|
|
|
)
|
2025-12-05 21:45:30 +00:00
|
|
|
from .... base import TriplesStoreService, CollectionConfigHandler
|
2025-09-18 15:57:52 +01:00
|
|
|
from .... base import AsyncProcessor, Consumer, Producer
|
|
|
|
|
from .... base import ConsumerMetrics, ProducerMetrics
|
2025-09-03 23:41:22 +01:00
|
|
|
from .... base.cassandra_config import add_cassandra_args, resolve_cassandra_config
|
2026-02-16 13:26:43 +00:00
|
|
|
from .... schema import IRI, LITERAL, BLANK, TRIPLE
|
2024-07-10 23:20:06 +01:00
|
|
|
|
2025-07-30 23:18:38 +01:00
|
|
|
# Module logger
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
2025-04-22 20:21:38 +01:00
|
|
|
default_ident = "triples-write"
|
2024-07-23 21:34:03 +01:00
|
|
|
|
2024-07-15 17:17:04 +01:00
|
|
|
|
2026-03-06 12:23:58 +00:00
|
|
|
def serialize_triple(triple):
    """Return *triple* encoded as a JSON string, or None for a None triple.

    Each term is encoded as a small dict keyed by its type; nested
    (quoted) triples recurse through this same function.
    """

    if triple is None:
        return None

    def _encode(term):
        # A missing term encodes as JSON null.
        if term is None:
            return None

        kind = term.type
        encoded = {"type": kind}

        if kind == IRI:
            encoded["iri"] = term.iri
        elif kind == LITERAL:
            encoded["value"] = term.value
            # Datatype / language tag are optional literal metadata;
            # only emit them when present.
            if term.datatype:
                encoded["datatype"] = term.datatype
            if term.language:
                encoded["language"] = term.language
        elif kind == BLANK:
            encoded["id"] = term.id
        elif kind == TRIPLE:
            # Reified triple: store its own JSON serialization.
            encoded["triple"] = serialize_triple(term.triple)

        return encoded

    return json.dumps({
        "s": _encode(triple.s),
        "p": _encode(triple.p),
        "o": _encode(triple.o),
    })
|
|
|
|
|
|
|
|
|
|
|
2026-01-27 13:48:08 +00:00
|
|
|
def get_term_value(term):
    """Return the string value carried by a Term, or None for a None term."""

    if term is None:
        return None

    kind = term.type

    if kind == IRI:
        return term.iri

    if kind == LITERAL:
        return term.value

    if kind == TRIPLE:
        # Nested (quoted) triples are flattened to their JSON form.
        return serialize_triple(term.triple)

    # Blank nodes and anything else: prefer the node id, fall back to value.
    return term.id or term.value
|
|
|
|
|
|
|
|
|
|
|
2026-02-16 13:26:43 +00:00
|
|
|
def get_term_otype(term):
    """
    Map a Term to its object-type code for entity-centric storage.

    Mapping of Term.type to otype:
    - IRI ("i") → "u" (URI)
    - BLANK ("b") → "u" (treated as URI)
    - LITERAL ("l") → "l" (Literal)
    - TRIPLE ("t") → "t" (Triple/reification)

    None and unrecognised types also map to "u".
    """

    if term is None:
        return "u"

    kind = term.type

    if kind == LITERAL:
        return "l"

    if kind == TRIPLE:
        return "t"

    # IRI, BLANK, and anything unrecognised are all stored as URIs.
    return "u"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_term_dtype(term):
    """Return a literal Term's datatype, or "" when absent or not a literal."""

    if term is not None and term.type == LITERAL:
        # Literal may have no datatype; normalise missing to "".
        return term.datatype or ""

    return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_term_lang(term):
    """Return a literal Term's language tag, or "" when absent or not a literal."""

    if term is not None and term.type == LITERAL:
        # Literal may have no language tag; normalise missing to "".
        return term.language or ""

    return ""
|
|
|
|
|
|
|
|
|
|
|
2025-12-05 21:45:30 +00:00
|
|
|
class Processor(CollectionConfigHandler, TriplesStoreService):
    """Writes incoming triples to a Cassandra-backed entity-centric graph.

    A single graph connection (``self.tg``) is cached per user keyspace:
    ``self.table`` records which user the current connection targets, and
    the connection is rebuilt whenever a request arrives for a different
    user.  All cassandra-driver work is synchronous, so each operation is
    run in a worker thread via ``asyncio.to_thread``.
    """

    def __init__(self, **params):
        """Initialise the processor.

        Cassandra settings may be passed in ``params`` or fall back to
        environment variables via ``resolve_cassandra_config``.
        """

        # Explicit Cassandra parameters, if supplied by the caller
        cassandra_host = params.get("cassandra_host")
        cassandra_username = params.get("cassandra_username")
        cassandra_password = params.get("cassandra_password")

        # Resolve configuration with environment variable fallback.
        # NOTE: the resolved keyspace is unused here — the keyspace is
        # chosen per-request from the message's user.
        hosts, username, password, keyspace = resolve_cassandra_config(
            host=cassandra_host,
            username=cassandra_username,
            password=cassandra_password
        )

        # Pass the resolved (rather than raw) settings up to the base
        # classes; the password is deliberately not echoed into params.
        super(Processor, self).__init__(
            **params | {
                "cassandra_host": ','.join(hosts),
                "cassandra_username": username
            }
        )

        self.cassandra_host = hosts
        self.cassandra_username = username
        self.cassandra_password = password
        self.table = None   # user whose keyspace self.tg is connected to
        self.tg = None      # cached knowledge-graph connection

        # Register for config push notifications
        self.register_config_handler(self.on_collection_config, types=["collection"])

    def _connect(self, user):
        """(Re)build the graph connection for *user*'s keyspace and cache it.

        Propagates any driver exception on connection failure; callers
        decide how to log and back off.  Must be called from a worker
        thread — construction is synchronous and may block.
        """
        self.tg = None

        # Single implementation today; kept as a local alias so an
        # alternative implementation can be selected here later.
        KGClass = EntityCentricKnowledgeGraph

        if self.cassandra_username and self.cassandra_password:
            self.tg = KGClass(
                hosts=self.cassandra_host,
                keyspace=user,
                username=self.cassandra_username,
                password=self.cassandra_password,
            )
        else:
            self.tg = KGClass(
                hosts=self.cassandra_host,
                keyspace=user,
            )

        # Only mark the connection as established once construction
        # succeeded, so a failed attempt is retried next time.
        self.table = user

    async def store_triples(self, message):
        """Write every triple in *message* to the message user's keyspace."""

        user = message.metadata.user

        # The cassandra-driver work below — connection, schema
        # setup, and per-triple inserts — is all synchronous.
        # Wrap the whole batch in a worker thread so the event
        # loop stays responsive for sibling processors when
        # running in a processor group.

        def _do_store():

            if self.table != user:
                try:
                    self._connect(user)
                except Exception as e:
                    logger.error(f"Exception: {e}", exc_info=True)
                    # Brief pause so a retrying caller doesn't hammer
                    # an unreachable cluster.
                    time.sleep(1)
                    raise

            for t in message.triples:

                # Extract values from Term objects
                s_val = get_term_value(t.s)
                p_val = get_term_value(t.p)
                o_val = get_term_value(t.o)

                # t.g is None for default graph, or a graph IRI
                g_val = t.g if t.g is not None else DEFAULT_GRAPH

                # Extract object type metadata for entity-centric storage
                otype = get_term_otype(t.o)
                dtype = get_term_dtype(t.o)
                lang = get_term_lang(t.o)

                self.tg.insert(
                    message.metadata.collection,
                    s_val,
                    p_val,
                    o_val,
                    g=g_val,
                    otype=otype,
                    dtype=dtype,
                    lang=lang,
                )

        await asyncio.to_thread(_do_store)

    async def create_collection(self, user: str, collection: str, metadata: dict):
        """Create a collection in Cassandra triple store via config push"""

        def _do_create():

            # Create or reuse connection for this user's keyspace
            if self.table != user:
                try:
                    self._connect(user)
                except Exception as e:
                    logger.error(f"Failed to connect to Cassandra for user {user}: {e}")
                    raise

            # Create collection using the built-in method
            logger.info(f"Creating collection {collection} for user {user}")

            if self.tg.collection_exists(collection):
                logger.info(f"Collection {collection} already exists")
            else:
                self.tg.create_collection(collection)
                logger.info(f"Created collection {collection}")

        try:
            await asyncio.to_thread(_do_create)
        except Exception as e:
            logger.error(f"Failed to create collection {user}/{collection}: {e}", exc_info=True)
            raise

    async def delete_collection(self, user: str, collection: str):
        """Delete all data for a specific collection from the unified triples table"""

        def _do_delete():

            # Create or reuse connection for this user's keyspace
            if self.table != user:
                try:
                    self._connect(user)
                except Exception as e:
                    logger.error(f"Failed to connect to Cassandra for user {user}: {e}")
                    raise

            # Delete all triples for this collection using the built-in method
            self.tg.delete_collection(collection)
            logger.info(f"Deleted all triples for collection {collection} from keyspace {user}")

        try:
            await asyncio.to_thread(_do_delete)
        except Exception as e:
            logger.error(f"Failed to delete collection {user}/{collection}: {e}", exc_info=True)
            raise

    @staticmethod
    def add_args(parser):
        """Register this service's command-line arguments on *parser*."""

        TriplesStoreService.add_args(parser)
        add_cassandra_args(parser)
|
2024-07-10 23:20:06 +01:00
|
|
|
|
|
|
|
|
def run():
    """Entry point: launch the triples-write processor service."""
    Processor.launch(default_ident, __doc__)
|
2024-07-10 23:20:06 +01:00
|
|
|
|