mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Better proc group logging and concurrency (#810)
- Silence pika, cassandra etc. logging at INFO (too much chatter) - Add per processor log tags so that logs can be understood in processor group. - Deal with RabbitMQ lag weirdness - Added more processor group examples
This commit is contained in:
parent
ce3c8b421b
commit
2bf4af294e
20 changed files with 1021 additions and 647 deletions
|
|
@ -8,12 +8,51 @@ ensuring consistent log formats, levels, and command-line arguments.
|
|||
Supports dual output to console and Loki for centralized log aggregation.
|
||||
"""
|
||||
|
||||
import contextvars
|
||||
import logging
|
||||
import logging.handlers
|
||||
from queue import Queue
|
||||
import os
|
||||
|
||||
|
||||
# Task-local processor identity. _ProcessorIdFilter reads this value
# to stamp every LogRecord with its owning processor, and
# logging_loki's emitter reads it (via record.tags) to label log
# lines in Loki. A ContextVar is used so asyncio subtasks
# automatically inherit the processor id of the supervisor task
# that spawned them.
current_processor_id = contextvars.ContextVar(
    "current_processor_id", default="unknown"
)


def set_processor_id(pid):
    """Bind *pid* as the processor id for the current task context.

    Every log record subsequently emitted from this task — and from
    any asyncio task spawned inside it — is tagged with this id in
    the console log format and in the Loki labels.
    """
    current_processor_id.set(pid)
|
||||
|
||||
|
||||
class _ProcessorIdFilter(logging.Filter):
    """Annotate each LogRecord with the task-local processor id.

    Two attributes are attached to every record:
      record.processor_id — consumed by the console format string
      record.tags         — merged into Loki labels by logging_loki's
                            emitter (which combines record.tags with
                            the handler's static tags)
    """

    def filter(self, record):
        processor = current_processor_id.get()
        record.processor_id = processor
        # Preserve any tags another filter/caller already attached;
        # build a fresh dict so we never mutate shared state.
        previous = getattr(record, "tags", None)
        merged = dict(previous) if previous else {}
        merged["processor"] = processor
        record.tags = merged
        return True
|
||||
|
||||
|
||||
def add_logging_args(parser):
|
||||
"""
|
||||
Add standard logging arguments to an argument parser.
|
||||
|
|
@ -87,12 +126,15 @@ def setup_logging(args):
|
|||
loki_url = args.get('loki_url', 'http://loki:3100/loki/api/v1/push')
|
||||
loki_username = args.get('loki_username')
|
||||
loki_password = args.get('loki_password')
|
||||
processor_id = args.get('id') # Processor identity (e.g., "config-svc", "text-completion")
|
||||
|
||||
try:
|
||||
from logging_loki import LokiHandler
|
||||
|
||||
# Create Loki handler with optional authentication and processor label
|
||||
# Create Loki handler with optional authentication. The
|
||||
# processor label is NOT baked in here — it's stamped onto
|
||||
# each record by _ProcessorIdFilter reading the task-local
|
||||
# contextvar, and logging_loki's emitter reads record.tags
|
||||
# to build per-record Loki labels.
|
||||
loki_handler_kwargs = {
|
||||
'url': loki_url,
|
||||
'version': "1",
|
||||
|
|
@ -101,10 +143,6 @@ def setup_logging(args):
|
|||
if loki_username and loki_password:
|
||||
loki_handler_kwargs['auth'] = (loki_username, loki_password)
|
||||
|
||||
# Add processor label if available (for consistency with Prometheus metrics)
|
||||
if processor_id:
|
||||
loki_handler_kwargs['tags'] = {'processor': processor_id}
|
||||
|
||||
loki_handler = LokiHandler(**loki_handler_kwargs)
|
||||
|
||||
# Wrap in QueueHandler for non-blocking operation
|
||||
|
|
@ -133,23 +171,44 @@ def setup_logging(args):
|
|||
print(f"WARNING: Failed to setup Loki logging: {e}")
|
||||
print("Continuing with console-only logging")
|
||||
|
||||
# Get processor ID for log formatting (use 'unknown' if not available)
|
||||
processor_id = args.get('id', 'unknown')
|
||||
|
||||
# Configure logging with all handlers
|
||||
# Use processor ID as the primary identifier in logs
|
||||
# Configure logging with all handlers. The processor id comes
|
||||
# from _ProcessorIdFilter (via contextvar) and is injected into
|
||||
# each record as record.processor_id. The format string reads
|
||||
# that attribute on every emit.
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, log_level.upper()),
|
||||
format=f'%(asctime)s - {processor_id} - %(levelname)s - %(message)s',
|
||||
format='%(asctime)s - %(processor_id)s - %(levelname)s - %(message)s',
|
||||
handlers=handlers,
|
||||
force=True # Force reconfiguration if already configured
|
||||
)
|
||||
|
||||
# Prevent recursive logging from Loki's HTTP client
|
||||
if loki_enabled and queue_listener:
|
||||
# Disable urllib3 logging to prevent infinite loop
|
||||
logging.getLogger('urllib3').setLevel(logging.WARNING)
|
||||
logging.getLogger('urllib3.connectionpool').setLevel(logging.WARNING)
|
||||
# Attach the processor-id filter to every handler so all records
|
||||
# passing through any sink get stamped (console, queue→loki,
|
||||
# future handlers). Filters on handlers run regardless of which
|
||||
# logger originated the record, so logs from pika, cassandra,
|
||||
# processor code, etc. all pass through it.
|
||||
processor_filter = _ProcessorIdFilter()
|
||||
for h in handlers:
|
||||
h.addFilter(processor_filter)
|
||||
|
||||
# Seed the contextvar from --id if one was supplied. In group
|
||||
# mode --id isn't present; the processor_group supervisor sets
|
||||
# it per task. In standalone mode AsyncProcessor.launch provides
|
||||
# it via argparse default.
|
||||
if args.get('id'):
|
||||
set_processor_id(args['id'])
|
||||
|
||||
# Silence noisy third-party library loggers. These emit INFO-level
|
||||
# chatter (connection churn, channel open/close, driver warnings) that
|
||||
# drowns the useful signal and can't be attributed to a specific
|
||||
# processor anyway. WARNING and above still propagate.
|
||||
for noisy in (
|
||||
'pika',
|
||||
'cassandra',
|
||||
'urllib3',
|
||||
'urllib3.connectionpool',
|
||||
):
|
||||
logging.getLogger(noisy).setLevel(logging.WARNING)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info(f"Logging configured with level: {log_level}")
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ import time
|
|||
|
||||
from prometheus_client import start_http_server
|
||||
|
||||
from . logging import add_logging_args, setup_logging
|
||||
from . logging import add_logging_args, setup_logging, set_processor_id
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -64,6 +64,13 @@ async def _supervise(entry):
|
|||
pid = entry["params"]["id"]
|
||||
class_path = entry["class"]
|
||||
|
||||
# Stamp the contextvar for this supervisor task. Every log
|
||||
# record emitted from this task — and from any inner TaskGroup
|
||||
# child created by the processor — inherits this id via
|
||||
# contextvar propagation. Siblings in the outer group set
|
||||
# their own id in their own task context and do not interfere.
|
||||
set_processor_id(pid)
|
||||
|
||||
while True:
|
||||
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -227,15 +227,30 @@ class RabbitMQBackendConsumer:
|
|||
self._connect()
|
||||
|
||||
def receive(self, timeout_millis: int = 2000) -> Message:
|
||||
"""Receive a message. Raises TimeoutError if none available."""
|
||||
"""Receive a message. Raises TimeoutError if none available.
|
||||
|
||||
Loop ordering matters: check _incoming at the TOP of each
|
||||
iteration, not as the loop condition. process_data_events
|
||||
may dispatch a message via the _on_message callback during
|
||||
the pump; we must re-check _incoming on the next iteration
|
||||
before giving up on the deadline. The previous control
|
||||
flow (`while deadline: check; pump`) could lose a wakeup if
|
||||
the pump consumed the remainder of the window — the
|
||||
`while` check would fail before `_incoming` was re-read,
|
||||
leaving a just-dispatched message stranded until the next
|
||||
receive() call one full poll cycle later.
|
||||
"""
|
||||
if not self._is_alive():
|
||||
self._connect()
|
||||
|
||||
timeout_seconds = timeout_millis / 1000.0
|
||||
deadline = time.monotonic() + timeout_seconds
|
||||
|
||||
while time.monotonic() < deadline:
|
||||
# Check if a message was already delivered
|
||||
while True:
|
||||
# Check if a message has been dispatched to our queue.
|
||||
# This catches both (a) messages dispatched before this
|
||||
# receive() was called and (b) messages dispatched
|
||||
# during the previous iteration's process_data_events.
|
||||
try:
|
||||
method, properties, body = self._incoming.get_nowait()
|
||||
return RabbitMQMessage(
|
||||
|
|
@ -244,14 +259,16 @@ class RabbitMQBackendConsumer:
|
|||
except queue.Empty:
|
||||
pass
|
||||
|
||||
# Drive pika's I/O — delivers messages and processes heartbeats
|
||||
remaining = deadline - time.monotonic()
|
||||
if remaining > 0:
|
||||
self._connection.process_data_events(
|
||||
time_limit=min(0.1, remaining),
|
||||
)
|
||||
if remaining <= 0:
|
||||
raise TimeoutError("No message received within timeout")
|
||||
|
||||
raise TimeoutError("No message received within timeout")
|
||||
# Drive pika's I/O. Any messages delivered during this
|
||||
# call land in _incoming via _on_message; the next
|
||||
# iteration of this loop catches them at the top.
|
||||
self._connection.process_data_events(
|
||||
time_limit=min(0.1, remaining),
|
||||
)
|
||||
|
||||
def acknowledge(self, message: Message) -> None:
|
||||
if isinstance(message, RabbitMQMessage) and message._method:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue