Better proc group logging and concurrency (#810)

- Silence pika, cassandra etc. logging at INFO (too much chatter)
- Add per-processor log tags so that logs can be understood when
  running in a processor group.
- Deal with RabbitMQ lag weirdness
- Add more processor group examples
This commit is contained in:
cybermaggedon 2026-04-15 14:52:01 +01:00 committed by GitHub
parent ce3c8b421b
commit 2bf4af294e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 1021 additions and 647 deletions

View file

@ -4,6 +4,7 @@ Embeddings service, applies an embeddings model using fastembed
Input is text, output is embeddings vector.
"""
import asyncio
import logging
from ... base import EmbeddingsService
@ -37,7 +38,13 @@ class Processor(EmbeddingsService):
self._load_model(model)
def _load_model(self, model_name):
"""Load a model, caching it for reuse"""
"""Load a model, caching it for reuse.
Synchronous CPU and I/O heavy. Callers that run on the
event loop must dispatch via asyncio.to_thread to avoid
freezing the loop (which, in processor-group deployments,
freezes every sibling processor in the same process).
"""
if self.cached_model_name != model_name:
logger.info(f"Loading FastEmbed model: {model_name}")
self.embeddings = TextEmbedding(model_name=model_name)
@ -46,6 +53,11 @@ class Processor(EmbeddingsService):
else:
logger.debug(f"Using cached model: {model_name}")
def _run_embed(self, texts):
    """Embed ``texts`` synchronously and return the vectors as a list.

    This is a blocking call. on_embeddings dispatches it through
    asyncio.to_thread so that it runs in a worker thread rather
    than on the event loop.
    """
    # embed() hands back an iterable; drain it here so the caller
    # receives a fully materialised list.
    vectors = self.embeddings.embed(texts)
    return list(vectors)
async def on_embeddings(self, texts, model=None):
if not texts:
@ -53,11 +65,18 @@ class Processor(EmbeddingsService):
use_model = model or self.default_model
# Reload model if it has changed
self._load_model(use_model)
# Reload model if it has changed. Model loading is sync
# and can take seconds; push it to a worker thread so the
# event loop (and any sibling processors in group mode)
# stay responsive.
if self.cached_model_name != use_model:
await asyncio.to_thread(self._load_model, use_model)
# FastEmbed processes the full batch efficiently
vecs = list(self.embeddings.embed(texts))
# FastEmbed inference is synchronous ONNX runtime work.
# Dispatch to a worker thread so the event loop stays
# responsive for other tasks (important in group mode
# where the loop is shared across many processors).
vecs = await asyncio.to_thread(self._run_embed, texts)
# Return list of vectors, one per input text
return [v.tolist() for v in vecs]

View file

@ -23,6 +23,7 @@ from .... schema import RowsQueryRequest, RowsQueryResponse, GraphQLError
from .... schema import Error, RowSchema, Field as SchemaField
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base.cassandra_config import add_cassandra_args, resolve_cassandra_config
from .... tables.cassandra_async import async_execute
from ... graphql import GraphQLSchemaBuilder, SortDirection
@ -263,7 +264,7 @@ class Processor(FlowProcessor):
query += f" LIMIT {limit}"
try:
rows = self.session.execute(query, params)
rows = await async_execute(self.session, query, params)
for row in rows:
# Convert data map to dict with proper field names
row_dict = dict(row.data) if row.data else {}
@ -301,7 +302,7 @@ class Processor(FlowProcessor):
params = [collection, schema_name, primary_index]
try:
rows = self.session.execute(query, params)
rows = await async_execute(self.session, query, params)
for row in rows:
row_dict = dict(row.data) if row.data else {}

View file

@ -4,6 +4,7 @@ Triples query service. Input is a (s, p, o, g) quad pattern, some values may be
null. Output is a list of quads.
"""
import asyncio
import logging
import json
@ -200,7 +201,11 @@ class Processor(TriplesQueryService):
try:
self.ensure_connection(query.user)
# ensure_connection may construct a fresh
# EntityCentricKnowledgeGraph which does sync schema
# setup against Cassandra. Push it to a worker thread
# so the event loop doesn't block on first-use per user.
await asyncio.to_thread(self.ensure_connection, query.user)
# Extract values from query
s_val = get_term_value(query.s)
@ -218,14 +223,21 @@ class Processor(TriplesQueryService):
quads = []
# All self.tg.get_* calls below are sync wrappers around
# cassandra session.execute. Materialise inside a worker
# thread so iteration never triggers sync paging back on
# the event loop.
# Route to appropriate query method based on which fields are specified
if s_val is not None:
if p_val is not None:
if o_val is not None:
# SPO specified - find matching graphs
resp = self.tg.get_spo(
query.collection, s_val, p_val, o_val, g=g_val,
limit=query.limit
resp = await asyncio.to_thread(
lambda: list(self.tg.get_spo(
query.collection, s_val, p_val, o_val,
g=g_val, limit=query.limit,
))
)
for t in resp:
g = t.g if hasattr(t, 'g') else DEFAULT_GRAPH
@ -233,9 +245,11 @@ class Processor(TriplesQueryService):
quads.append((s_val, p_val, o_val, g, term_type, datatype, language))
else:
# SP specified
resp = self.tg.get_sp(
query.collection, s_val, p_val, g=g_val,
limit=query.limit
resp = await asyncio.to_thread(
lambda: list(self.tg.get_sp(
query.collection, s_val, p_val,
g=g_val, limit=query.limit,
))
)
for t in resp:
g = t.g if hasattr(t, 'g') else DEFAULT_GRAPH
@ -244,9 +258,11 @@ class Processor(TriplesQueryService):
else:
if o_val is not None:
# SO specified
resp = self.tg.get_os(
query.collection, o_val, s_val, g=g_val,
limit=query.limit
resp = await asyncio.to_thread(
lambda: list(self.tg.get_os(
query.collection, o_val, s_val,
g=g_val, limit=query.limit,
))
)
for t in resp:
g = t.g if hasattr(t, 'g') else DEFAULT_GRAPH
@ -254,9 +270,11 @@ class Processor(TriplesQueryService):
quads.append((s_val, t.p, o_val, g, term_type, datatype, language))
else:
# S only
resp = self.tg.get_s(
query.collection, s_val, g=g_val,
limit=query.limit
resp = await asyncio.to_thread(
lambda: list(self.tg.get_s(
query.collection, s_val,
g=g_val, limit=query.limit,
))
)
for t in resp:
g = t.g if hasattr(t, 'g') else DEFAULT_GRAPH
@ -266,9 +284,11 @@ class Processor(TriplesQueryService):
if p_val is not None:
if o_val is not None:
# PO specified
resp = self.tg.get_po(
query.collection, p_val, o_val, g=g_val,
limit=query.limit
resp = await asyncio.to_thread(
lambda: list(self.tg.get_po(
query.collection, p_val, o_val,
g=g_val, limit=query.limit,
))
)
for t in resp:
g = t.g if hasattr(t, 'g') else DEFAULT_GRAPH
@ -276,9 +296,11 @@ class Processor(TriplesQueryService):
quads.append((t.s, p_val, o_val, g, term_type, datatype, language))
else:
# P only
resp = self.tg.get_p(
query.collection, p_val, g=g_val,
limit=query.limit
resp = await asyncio.to_thread(
lambda: list(self.tg.get_p(
query.collection, p_val,
g=g_val, limit=query.limit,
))
)
for t in resp:
g = t.g if hasattr(t, 'g') else DEFAULT_GRAPH
@ -287,9 +309,11 @@ class Processor(TriplesQueryService):
else:
if o_val is not None:
# O only
resp = self.tg.get_o(
query.collection, o_val, g=g_val,
limit=query.limit
resp = await asyncio.to_thread(
lambda: list(self.tg.get_o(
query.collection, o_val,
g=g_val, limit=query.limit,
))
)
for t in resp:
g = t.g if hasattr(t, 'g') else DEFAULT_GRAPH
@ -297,9 +321,10 @@ class Processor(TriplesQueryService):
quads.append((t.s, t.p, o_val, g, term_type, datatype, language))
else:
# Nothing specified - get all
resp = self.tg.get_all(
query.collection,
limit=query.limit
resp = await asyncio.to_thread(
lambda: list(self.tg.get_all(
query.collection, limit=query.limit,
))
)
for t in resp:
# Note: quads_by_collection uses 'd' for graph field
@ -340,7 +365,7 @@ class Processor(TriplesQueryService):
Uses Cassandra's paging to fetch results incrementally.
"""
try:
self.ensure_connection(query.user)
await asyncio.to_thread(self.ensure_connection, query.user)
batch_size = query.batch_size if query.batch_size > 0 else 20
limit = query.limit if query.limit > 0 else 10000
@ -374,9 +399,16 @@ class Processor(TriplesQueryService):
yield batch, is_final
return
# Create statement with fetch_size for true streaming
# Materialise in a worker thread. We lose true streaming
# paging (the driver fetches all pages eagerly inside the
# thread) but the event loop stays responsive, and result
# sets at this layer are typically small enough that this
# is acceptable. If true async paging is needed later,
# revisit using ResponseFuture page callbacks.
statement = SimpleStatement(cql, fetch_size=batch_size)
result_set = self.tg.session.execute(statement, params)
result_set = await asyncio.to_thread(
lambda: list(self.tg.session.execute(statement, params))
)
batch = []
count = 0

View file

@ -13,6 +13,7 @@ Uses a single 'rows' table with the schema:
Each row is written multiple times - once per indexed field defined in the schema.
"""
import asyncio
import json
import logging
import re
@ -26,6 +27,7 @@ from .... schema import RowSchema, Field
from .... base import FlowProcessor, ConsumerSpec
from .... base import CollectionConfigHandler
from .... base.cassandra_config import add_cassandra_args, resolve_cassandra_config
from .... tables.cassandra_async import async_execute
# Module logger
logger = logging.getLogger(__name__)
@ -361,11 +363,15 @@ class Processor(CollectionConfigHandler, FlowProcessor):
schema_name = obj.schema_name
source = getattr(obj.metadata, 'source', '') or ''
# Ensure tables exist
self.ensure_tables(keyspace)
# Ensure tables exist (sync DDL — push to a worker thread
# so the event loop stays responsive when running in a
# processor group sharing the loop with siblings).
await asyncio.to_thread(self.ensure_tables, keyspace)
# Register partitions if first time seeing this (collection, schema_name)
self.register_partitions(keyspace, collection, schema_name)
await asyncio.to_thread(
self.register_partitions, keyspace, collection, schema_name
)
safe_keyspace = self.sanitize_name(keyspace)
@ -406,9 +412,10 @@ class Processor(CollectionConfigHandler, FlowProcessor):
continue
try:
self.session.execute(
await async_execute(
self.session,
insert_cql,
(collection, schema_name, index_name, index_value, data_map, source)
(collection, schema_name, index_name, index_value, data_map, source),
)
rows_written += 1
except Exception as e:
@ -425,18 +432,18 @@ class Processor(CollectionConfigHandler, FlowProcessor):
async def create_collection(self, user: str, collection: str, metadata: dict):
"""Create/verify collection exists in Cassandra row store"""
# Connect if not already connected
self.connect_cassandra()
# Connect if not already connected (sync, push to thread)
await asyncio.to_thread(self.connect_cassandra)
# Ensure tables exist
self.ensure_tables(user)
# Ensure tables exist (sync DDL, push to thread)
await asyncio.to_thread(self.ensure_tables, user)
logger.info(f"Collection {collection} ready for user {user}")
async def delete_collection(self, user: str, collection: str):
"""Delete all data for a specific collection using partition tracking"""
# Connect if not already connected
self.connect_cassandra()
await asyncio.to_thread(self.connect_cassandra)
safe_keyspace = self.sanitize_name(user)
@ -446,8 +453,10 @@ class Processor(CollectionConfigHandler, FlowProcessor):
SELECT keyspace_name FROM system_schema.keyspaces
WHERE keyspace_name = %s
"""
result = self.session.execute(check_keyspace_cql, (safe_keyspace,))
if not result.one():
result = await async_execute(
self.session, check_keyspace_cql, (safe_keyspace,)
)
if not result:
logger.info(f"Keyspace {safe_keyspace} does not exist, nothing to delete")
return
self.known_keyspaces.add(user)
@ -459,8 +468,9 @@ class Processor(CollectionConfigHandler, FlowProcessor):
"""
try:
partitions = self.session.execute(select_partitions_cql, (collection,))
partition_list = list(partitions)
partition_list = await async_execute(
self.session, select_partitions_cql, (collection,)
)
except Exception as e:
logger.error(f"Failed to query partitions for collection {collection}: {e}")
raise
@ -474,9 +484,10 @@ class Processor(CollectionConfigHandler, FlowProcessor):
partitions_deleted = 0
for partition in partition_list:
try:
self.session.execute(
await async_execute(
self.session,
delete_rows_cql,
(collection, partition.schema_name, partition.index_name)
(collection, partition.schema_name, partition.index_name),
)
partitions_deleted += 1
except Exception as e:
@ -493,7 +504,9 @@ class Processor(CollectionConfigHandler, FlowProcessor):
"""
try:
self.session.execute(delete_partitions_cql, (collection,))
await async_execute(
self.session, delete_partitions_cql, (collection,)
)
except Exception as e:
logger.error(f"Failed to clean up row_partitions for {collection}: {e}")
raise
@ -512,7 +525,7 @@ class Processor(CollectionConfigHandler, FlowProcessor):
async def delete_collection_schema(self, user: str, collection: str, schema_name: str):
"""Delete all data for a specific collection + schema combination"""
# Connect if not already connected
self.connect_cassandra()
await asyncio.to_thread(self.connect_cassandra)
safe_keyspace = self.sanitize_name(user)
@ -523,8 +536,9 @@ class Processor(CollectionConfigHandler, FlowProcessor):
"""
try:
partitions = self.session.execute(select_partitions_cql, (collection, schema_name))
partition_list = list(partitions)
partition_list = await async_execute(
self.session, select_partitions_cql, (collection, schema_name)
)
except Exception as e:
logger.error(
f"Failed to query partitions for {collection}/{schema_name}: {e}"
@ -540,9 +554,10 @@ class Processor(CollectionConfigHandler, FlowProcessor):
partitions_deleted = 0
for partition in partition_list:
try:
self.session.execute(
await async_execute(
self.session,
delete_rows_cql,
(collection, schema_name, partition.index_name)
(collection, schema_name, partition.index_name),
)
partitions_deleted += 1
except Exception as e:
@ -559,7 +574,11 @@ class Processor(CollectionConfigHandler, FlowProcessor):
"""
try:
self.session.execute(delete_partitions_cql, (collection, schema_name))
await async_execute(
self.session,
delete_partitions_cql,
(collection, schema_name),
)
except Exception as e:
logger.error(
f"Failed to clean up row_partitions for {collection}/{schema_name}: {e}"

View file

@ -3,6 +3,7 @@
Graph writer. Input is graph edge. Writes edges to Cassandra graph.
"""
import asyncio
import base64
import os
import argparse
@ -150,59 +151,71 @@ class Processor(CollectionConfigHandler, TriplesStoreService):
user = message.metadata.user
if self.table is None or self.table != user:
# The cassandra-driver work below — connection, schema
# setup, and per-triple inserts — is all synchronous.
# Wrap the whole batch in a worker thread so the event
# loop stays responsive for sibling processors when
# running in a processor group.
self.tg = None
def _do_store():
# Use factory function to select implementation
KGClass = EntityCentricKnowledgeGraph
if self.table is None or self.table != user:
try:
if self.cassandra_username and self.cassandra_password:
self.tg = KGClass(
hosts=self.cassandra_host,
keyspace=message.metadata.user,
username=self.cassandra_username, password=self.cassandra_password
)
else:
self.tg = KGClass(
hosts=self.cassandra_host,
keyspace=message.metadata.user,
)
except Exception as e:
logger.error(f"Exception: {e}", exc_info=True)
time.sleep(1)
raise e
self.tg = None
self.table = user
# Use factory function to select implementation
KGClass = EntityCentricKnowledgeGraph
for t in message.triples:
# Extract values from Term objects
s_val = get_term_value(t.s)
p_val = get_term_value(t.p)
o_val = get_term_value(t.o)
# t.g is None for default graph, or a graph IRI
g_val = t.g if t.g is not None else DEFAULT_GRAPH
try:
if self.cassandra_username and self.cassandra_password:
self.tg = KGClass(
hosts=self.cassandra_host,
keyspace=message.metadata.user,
username=self.cassandra_username,
password=self.cassandra_password,
)
else:
self.tg = KGClass(
hosts=self.cassandra_host,
keyspace=message.metadata.user,
)
except Exception as e:
logger.error(f"Exception: {e}", exc_info=True)
time.sleep(1)
raise e
# Extract object type metadata for entity-centric storage
otype = get_term_otype(t.o)
dtype = get_term_dtype(t.o)
lang = get_term_lang(t.o)
self.table = user
self.tg.insert(
message.metadata.collection,
s_val,
p_val,
o_val,
g=g_val,
otype=otype,
dtype=dtype,
lang=lang
)
for t in message.triples:
# Extract values from Term objects
s_val = get_term_value(t.s)
p_val = get_term_value(t.p)
o_val = get_term_value(t.o)
# t.g is None for default graph, or a graph IRI
g_val = t.g if t.g is not None else DEFAULT_GRAPH
# Extract object type metadata for entity-centric storage
otype = get_term_otype(t.o)
dtype = get_term_dtype(t.o)
lang = get_term_lang(t.o)
self.tg.insert(
message.metadata.collection,
s_val,
p_val,
o_val,
g=g_val,
otype=otype,
dtype=dtype,
lang=lang,
)
await asyncio.to_thread(_do_store)
async def create_collection(self, user: str, collection: str, metadata: dict):
"""Create a collection in Cassandra triple store via config push"""
try:
def _do_create():
# Create or reuse connection for this user's keyspace
if self.table is None or self.table != user:
self.tg = None
@ -216,7 +229,7 @@ class Processor(CollectionConfigHandler, TriplesStoreService):
hosts=self.cassandra_host,
keyspace=user,
username=self.cassandra_username,
password=self.cassandra_password
password=self.cassandra_password,
)
else:
self.tg = KGClass(
@ -238,13 +251,16 @@ class Processor(CollectionConfigHandler, TriplesStoreService):
self.tg.create_collection(collection)
logger.info(f"Created collection {collection}")
try:
await asyncio.to_thread(_do_create)
except Exception as e:
logger.error(f"Failed to create collection {user}/{collection}: {e}", exc_info=True)
raise
async def delete_collection(self, user: str, collection: str):
"""Delete all data for a specific collection from the unified triples table"""
try:
def _do_delete():
# Create or reuse connection for this user's keyspace
if self.table is None or self.table != user:
self.tg = None
@ -258,7 +274,7 @@ class Processor(CollectionConfigHandler, TriplesStoreService):
hosts=self.cassandra_host,
keyspace=user,
username=self.cassandra_username,
password=self.cassandra_password
password=self.cassandra_password,
)
else:
self.tg = KGClass(
@ -275,6 +291,8 @@ class Processor(CollectionConfigHandler, TriplesStoreService):
self.tg.delete_collection(collection)
logger.info(f"Deleted all triples for collection {collection} from keyspace {user}")
try:
await asyncio.to_thread(_do_delete)
except Exception as e:
logger.error(f"Failed to delete collection {user}/{collection}: {e}", exc_info=True)
raise

View file

@ -0,0 +1,78 @@
"""
Async wrapper for cassandra-driver sessions.

The cassandra driver exposes a callback-based async API via
session.execute_async, returning a ResponseFuture that fires
on_result / on_error from the driver's own worker thread.
This module bridges that into an awaitable interface.

Usage:
    from ..tables.cassandra_async import async_execute

    rows = await async_execute(self.cassandra, stmt, (param1, param2))
    for row in rows:
        ...

Notes:
    - Rows are accumulated into a list inside the driver callback
      before the future is resolved, so subsequent iteration in the
      caller never triggers a sync page-fetch on the asyncio loop.
    - Multi-page results are followed to the end using the driver's
      has_more_pages / start_fetching_next_page mechanism, so the
      returned list holds every row of the result, not just the
      first page. The whole result is held in memory; callers that
      expect very large results should bound them with LIMIT.
    - Callbacks normally fire on a driver worker thread;
      call_soon_threadsafe is used to hand the result back to the
      asyncio loop.
    - Errors from the driver are re-raised in the awaiting coroutine.
"""

import asyncio


async def async_execute(session, query, parameters=None):
    """Execute a CQL statement asynchronously.

    Args:
        session: cassandra.cluster.Session (self.cassandra)
        query: statement string or PreparedStatement
        parameters: tuple/list of bind params, or None

    Returns:
        A list of all rows in the result, gathered across every
        result page. Statements that produce no rows yield an
        empty list.

    Raises:
        Whatever exception the driver reports through its error
        callback, or any exception raised while collecting rows or
        requesting the next page.
    """
    loop = asyncio.get_running_loop()
    fut = loop.create_future()

    # Rows from every page end up here. Pages are delivered one at
    # a time (the next page is only requested from inside the
    # callback), so no extra locking is needed.
    collected = []

    def on_result(rows):
        # Collect on the driver thread so the loop thread never
        # touches a lazy iterator that might trigger further sync
        # I/O.
        try:
            if rows is not None:
                collected.extend(rows)
            if rf.has_more_pages:
                # The registered callbacks fire again when the
                # next page arrives; resolve the future only after
                # the final page.
                rf.start_fetching_next_page()
                return
        except Exception as e:
            loop.call_soon_threadsafe(
                _set_exception_if_pending, fut, e
            )
            return
        loop.call_soon_threadsafe(
            _set_result_if_pending, fut, collected
        )

    def on_error(exc):
        loop.call_soon_threadsafe(
            _set_exception_if_pending, fut, exc
        )

    rf = session.execute_async(query, parameters)
    rf.add_callbacks(on_result, on_error)
    return await fut


def _set_result_if_pending(fut, result):
    """Resolve ``fut`` with ``result`` unless it is already done."""
    # The awaiting task may have been cancelled in the meantime, in
    # which case the future is already done and must not be set.
    if not fut.done():
        fut.set_result(result)


def _set_exception_if_pending(fut, exc):
    """Fail ``fut`` with ``exc`` unless it is already done."""
    if not fut.done():
        fut.set_exception(exc)

View file

@ -11,6 +11,8 @@ import time
import asyncio
import logging
from . cassandra_async import async_execute
logger = logging.getLogger(__name__)
class ConfigTableStore:
@ -102,21 +104,20 @@ class ConfigTableStore:
async def inc_version(self):
self.cassandra.execute("""
await async_execute(self.cassandra, """
UPDATE version set version = version + 1
WHERE id = 'version'
""")
async def get_version(self):
resp = self.cassandra.execute("""
rows = await async_execute(self.cassandra, """
SELECT version FROM version
WHERE id = 'version'
""")
row = resp.one()
if row: return row[0]
if rows:
return rows[0][0]
return None
@ -153,150 +154,91 @@ class ConfigTableStore:
""")
async def put_config(self, cls, key, value):
while True:
try:
resp = self.cassandra.execute(
self.put_config_stmt,
( cls, key, value )
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.put_config_stmt,
(cls, key, value),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
async def get_value(self, cls, key):
try:
rows = await async_execute(
self.cassandra,
self.get_value_stmt,
(cls, key),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
while True:
try:
resp = self.cassandra.execute(
self.get_value_stmt,
( cls, key )
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
for row in resp:
for row in rows:
return row[0]
return None
async def get_values(self, cls):
try:
rows = await async_execute(
self.cassandra,
self.get_values_stmt,
(cls,),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
while True:
try:
resp = self.cassandra.execute(
self.get_values_stmt,
( cls, )
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
return [
[row[0], row[1]]
for row in resp
]
return [[row[0], row[1]] for row in rows]
async def get_classes(self):
try:
rows = await async_execute(
self.cassandra,
self.get_classes_stmt,
(),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
while True:
try:
resp = self.cassandra.execute(
self.get_classes_stmt,
()
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
return [
row[0] for row in resp
]
return [row[0] for row in rows]
async def get_all(self):
try:
rows = await async_execute(
self.cassandra,
self.get_all_stmt,
(),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
while True:
try:
resp = self.cassandra.execute(
self.get_all_stmt,
()
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
return [
(row[0], row[1], row[2])
for row in resp
]
return [(row[0], row[1], row[2]) for row in rows]
async def get_keys(self, cls):
try:
rows = await async_execute(
self.cassandra,
self.get_keys_stmt,
(cls,),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
while True:
try:
resp = self.cassandra.execute(
self.get_keys_stmt,
( cls, )
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
return [
row[0] for row in resp
]
return [row[0] for row in rows]
async def delete_key(self, cls, key):
while True:
try:
resp = self.cassandra.execute(
self.delete_key_stmt,
(cls, key)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.delete_key_stmt,
(cls, key),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise

View file

@ -4,6 +4,8 @@ from .. schema import Metadata, Term, IRI, LITERAL, GraphEmbeddings
from cassandra.cluster import Cluster
from . cassandra_async import async_execute
def term_to_tuple(term):
"""Convert Term to (value, is_uri) tuple for database storage."""
@ -225,25 +227,19 @@ class KnowledgeTableStore:
for v in m.triples
]
while True:
try:
resp = self.cassandra.execute(
self.insert_triples_stmt,
(
uuid.uuid4(), m.metadata.user,
m.metadata.root or m.metadata.id, when,
[], triples,
)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.insert_triples_stmt,
(
uuid.uuid4(), m.metadata.user,
m.metadata.root or m.metadata.id, when,
[], triples,
),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
async def add_graph_embeddings(self, m):
@ -257,25 +253,19 @@ class KnowledgeTableStore:
for v in m.entities
]
while True:
try:
resp = self.cassandra.execute(
self.insert_graph_embeddings_stmt,
(
uuid.uuid4(), m.metadata.user,
m.metadata.root or m.metadata.id, when,
[], entities,
)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.insert_graph_embeddings_stmt,
(
uuid.uuid4(), m.metadata.user,
m.metadata.root or m.metadata.id, when,
[], entities,
),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
async def add_document_embeddings(self, m):
@ -289,50 +279,35 @@ class KnowledgeTableStore:
for v in m.chunks
]
while True:
try:
resp = self.cassandra.execute(
self.insert_document_embeddings_stmt,
(
uuid.uuid4(), m.metadata.user,
m.metadata.root or m.metadata.id, when,
[], chunks,
)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.insert_document_embeddings_stmt,
(
uuid.uuid4(), m.metadata.user,
m.metadata.root or m.metadata.id, when,
[], chunks,
),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
async def list_kg_cores(self, user):
logger.debug("List kg cores...")
while True:
try:
rows = await async_execute(
self.cassandra,
self.list_cores_stmt,
(user,),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
try:
resp = self.cassandra.execute(
self.list_cores_stmt,
(user,)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
lst = [
row[1]
for row in resp
]
lst = [row[1] for row in rows]
logger.debug("Done")
@ -342,56 +317,41 @@ class KnowledgeTableStore:
logger.debug("Delete kg cores...")
while True:
try:
await async_execute(
self.cassandra,
self.delete_triples_stmt,
(user, document_id),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
try:
resp = self.cassandra.execute(
self.delete_triples_stmt,
(user, document_id)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
while True:
try:
resp = self.cassandra.execute(
self.delete_graph_embeddings_stmt,
(user, document_id)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.delete_graph_embeddings_stmt,
(user, document_id),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
async def get_triples(self, user, document_id, receiver):
logger.debug("Get triples...")
while True:
try:
rows = await async_execute(
self.cassandra,
self.get_triples_stmt,
(user, document_id),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
try:
resp = self.cassandra.execute(
self.get_triples_stmt,
(user, document_id)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
for row in resp:
for row in rows:
if row[3]:
triples = [
@ -422,22 +382,17 @@ class KnowledgeTableStore:
logger.debug("Get GE...")
while True:
try:
rows = await async_execute(
self.cassandra,
self.get_graph_embeddings_stmt,
(user, document_id),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
try:
resp = self.cassandra.execute(
self.get_graph_embeddings_stmt,
(user, document_id)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
for row in resp:
for row in rows:
if row[3]:
entities = [

View file

@ -31,6 +31,8 @@ import time
import asyncio
import logging
from . cassandra_async import async_execute
logger = logging.getLogger(__name__)
class LibraryTableStore:
@ -321,18 +323,13 @@ class LibraryTableStore:
async def document_exists(self, user, id):
resp = self.cassandra.execute(
rows = await async_execute(
self.cassandra,
self.test_document_exists_stmt,
( user, id )
(user, id),
)
# If a row exists, document exists. It's a cursor, can't just
# count the length
for row in resp:
return True
return False
return bool(rows)
async def add_document(self, document, object_id):
@ -349,26 +346,20 @@ class LibraryTableStore:
parent_id = getattr(document, 'parent_id', '') or ''
document_type = getattr(document, 'document_type', 'source') or 'source'
while True:
try:
resp = self.cassandra.execute(
self.insert_document_stmt,
(
document.id, document.user, int(document.time * 1000),
document.kind, document.title, document.comments,
metadata, document.tags, object_id,
parent_id, document_type
)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.insert_document_stmt,
(
document.id, document.user, int(document.time * 1000),
document.kind, document.title, document.comments,
metadata, document.tags, object_id,
parent_id, document_type
),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
logger.debug("Add complete")
@ -383,25 +374,19 @@ class LibraryTableStore:
for v in document.metadata
]
while True:
try:
resp = self.cassandra.execute(
self.update_document_stmt,
(
int(document.time * 1000), document.title,
document.comments, metadata, document.tags,
document.user, document.id
)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.update_document_stmt,
(
int(document.time * 1000), document.title,
document.comments, metadata, document.tags,
document.user, document.id
),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
logger.debug("Update complete")
@ -409,23 +394,15 @@ class LibraryTableStore:
logger.info(f"Removing document {document_id}")
while True:
try:
resp = self.cassandra.execute(
self.delete_document_stmt,
(
user, document_id
)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.delete_document_stmt,
(user, document_id),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
logger.debug("Delete complete")
@ -433,21 +410,15 @@ class LibraryTableStore:
logger.debug("List documents...")
while True:
try:
resp = self.cassandra.execute(
self.list_document_stmt,
(user,)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
rows = await async_execute(
self.cassandra,
self.list_document_stmt,
(user,),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
lst = [
DocumentMetadata(
@ -469,7 +440,7 @@ class LibraryTableStore:
parent_id = row[8] if row[8] else "",
document_type = row[9] if row[9] else "source",
)
for row in resp
for row in rows
]
logger.debug("Done")
@ -481,20 +452,15 @@ class LibraryTableStore:
logger.debug(f"List children for parent {parent_id}")
while True:
try:
resp = self.cassandra.execute(
self.list_children_stmt,
(parent_id,)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
rows = await async_execute(
self.cassandra,
self.list_children_stmt,
(parent_id,),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
lst = [
DocumentMetadata(
@ -516,7 +482,7 @@ class LibraryTableStore:
parent_id = row[9] if row[9] else "",
document_type = row[10] if row[10] else "source",
)
for row in resp
for row in rows
]
logger.debug("Done")
@ -527,23 +493,17 @@ class LibraryTableStore:
logger.debug("Get document")
while True:
try:
rows = await async_execute(
self.cassandra,
self.get_document_stmt,
(user, id),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
try:
resp = self.cassandra.execute(
self.get_document_stmt,
(user, id)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
for row in resp:
for row in rows:
doc = DocumentMetadata(
id = id,
user = user,
@ -573,23 +533,17 @@ class LibraryTableStore:
logger.debug("Get document obj ID")
while True:
try:
rows = await async_execute(
self.cassandra,
self.get_document_stmt,
(user, id),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
try:
resp = self.cassandra.execute(
self.get_document_stmt,
(user, id)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
for row in resp:
for row in rows:
logger.debug("Done")
return row[6]
@ -597,43 +551,32 @@ class LibraryTableStore:
async def processing_exists(self, user, id):
resp = self.cassandra.execute(
rows = await async_execute(
self.cassandra,
self.test_processing_exists_stmt,
( user, id )
(user, id),
)
# If a row exists, document exists. It's a cursor, can't just
# count the length
for row in resp:
return True
return False
return bool(rows)
async def add_processing(self, processing):
logger.info(f"Adding processing {processing.id}")
while True:
try:
resp = self.cassandra.execute(
self.insert_processing_stmt,
(
processing.id, processing.document_id,
int(processing.time * 1000), processing.flow,
processing.user, processing.collection,
processing.tags
)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.insert_processing_stmt,
(
processing.id, processing.document_id,
int(processing.time * 1000), processing.flow,
processing.user, processing.collection,
processing.tags
),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
logger.debug("Add complete")
@ -641,23 +584,15 @@ class LibraryTableStore:
logger.info(f"Removing processing {processing_id}")
while True:
try:
resp = self.cassandra.execute(
self.delete_processing_stmt,
(
user, processing_id
)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.delete_processing_stmt,
(user, processing_id),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
logger.debug("Delete complete")
@ -665,21 +600,15 @@ class LibraryTableStore:
logger.debug("List processing objects")
while True:
try:
resp = self.cassandra.execute(
self.list_processing_stmt,
(user,)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
rows = await async_execute(
self.cassandra,
self.list_processing_stmt,
(user,),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
lst = [
ProcessingMetadata(
@ -691,7 +620,7 @@ class LibraryTableStore:
collection = row[4],
tags = row[5] if row[5] else [],
)
for row in resp
for row in rows
]
logger.debug("Done")
@ -718,20 +647,19 @@ class LibraryTableStore:
now = int(time.time() * 1000)
while True:
try:
self.cassandra.execute(
self.insert_upload_session_stmt,
(
upload_id, user, document_id, document_metadata,
s3_upload_id, object_id, total_size, chunk_size,
total_chunks, {}, now, now
)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.insert_upload_session_stmt,
(
upload_id, user, document_id, document_metadata,
s3_upload_id, object_id, total_size, chunk_size,
total_chunks, {}, now, now
),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
logger.debug("Upload session created")
@ -740,18 +668,17 @@ class LibraryTableStore:
logger.debug(f"Get upload session {upload_id}")
while True:
try:
resp = self.cassandra.execute(
self.get_upload_session_stmt,
(upload_id,)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
rows = await async_execute(
self.cassandra,
self.get_upload_session_stmt,
(upload_id,),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
for row in resp:
for row in rows:
session = {
"upload_id": row[0],
"user": row[1],
@ -778,20 +705,19 @@ class LibraryTableStore:
now = int(time.time() * 1000)
while True:
try:
self.cassandra.execute(
self.update_upload_session_chunk_stmt,
(
{chunk_index: etag},
now,
upload_id
)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.update_upload_session_chunk_stmt,
(
{chunk_index: etag},
now,
upload_id
),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
logger.debug("Chunk recorded")
@ -800,16 +726,15 @@ class LibraryTableStore:
logger.info(f"Deleting upload session {upload_id}")
while True:
try:
self.cassandra.execute(
self.delete_upload_session_stmt,
(upload_id,)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
await async_execute(
self.cassandra,
self.delete_upload_session_stmt,
(upload_id,),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
logger.debug("Upload session deleted")
@ -818,19 +743,18 @@ class LibraryTableStore:
logger.debug(f"List upload sessions for {user}")
while True:
try:
resp = self.cassandra.execute(
self.list_upload_sessions_stmt,
(user,)
)
break
except Exception as e:
logger.error("Exception occurred", exc_info=True)
raise e
try:
rows = await async_execute(
self.cassandra,
self.list_upload_sessions_stmt,
(user,),
)
except Exception:
logger.error("Exception occurred", exc_info=True)
raise
sessions = []
for row in resp:
for row in rows:
chunks_received = row[6] if row[6] else {}
sessions.append({
"upload_id": row[0],