Better proc group logging and concurrency (#810)

- Silence pika, cassandra etc. logging at INFO (too much chatter) 
- Add per-processor log tags so that logs can be understood in a
  processor group.
- Deal with RabbitMQ lag weirdness
- Added more processor group examples
This commit is contained in:
cybermaggedon 2026-04-15 14:52:01 +01:00 committed by GitHub
parent ce3c8b421b
commit 2bf4af294e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 1021 additions and 647 deletions

View file

@ -8,12 +8,51 @@ ensuring consistent log formats, levels, and command-line arguments.
Supports dual output to console and Loki for centralized log aggregation.
"""
import contextvars
import logging
import logging.handlers
from queue import Queue
import os
# Task-local processor identity. _ProcessorIdFilter reads this value
# to stamp every LogRecord with its owning processor, and
# logging_loki's emitter picks it up via record.tags to label log
# lines in Loki. A ContextVar is used so asyncio subtasks
# automatically inherit the id from the supervisor task that
# spawned them.
current_processor_id = contextvars.ContextVar(
    "current_processor_id", default="unknown"
)


def set_processor_id(pid):
    """Bind the processor id to the current task context.

    Every log record emitted afterwards from this task — and from
    any asyncio task spawned from it — is tagged with this id, both
    in the console format and in the Loki labels.
    """
    current_processor_id.set(pid)
class _ProcessorIdFilter(logging.Filter):
    """Logging filter that stamps records with the task-local processor id.

    Two attributes are attached to every record:

      record.processor_id  consumed by the console format string
      record.tags          merged into the Loki labels by logging_loki's
                           emitter (which combines record.tags with the
                           handler's static tags)
    """

    def filter(self, record):
        # Annotate only — this filter never drops a record.
        processor = current_processor_id.get()
        record.processor_id = processor
        prior = getattr(record, "tags", None)
        record.tags = dict(prior or {}, processor=processor)
        return True
def add_logging_args(parser):
"""
Add standard logging arguments to an argument parser.
@ -87,12 +126,15 @@ def setup_logging(args):
loki_url = args.get('loki_url', 'http://loki:3100/loki/api/v1/push')
loki_username = args.get('loki_username')
loki_password = args.get('loki_password')
processor_id = args.get('id') # Processor identity (e.g., "config-svc", "text-completion")
try:
from logging_loki import LokiHandler
# Create Loki handler with optional authentication and processor label
# Create Loki handler with optional authentication. The
# processor label is NOT baked in here — it's stamped onto
# each record by _ProcessorIdFilter reading the task-local
# contextvar, and logging_loki's emitter reads record.tags
# to build per-record Loki labels.
loki_handler_kwargs = {
'url': loki_url,
'version': "1",
@ -101,10 +143,6 @@ def setup_logging(args):
if loki_username and loki_password:
loki_handler_kwargs['auth'] = (loki_username, loki_password)
# Add processor label if available (for consistency with Prometheus metrics)
if processor_id:
loki_handler_kwargs['tags'] = {'processor': processor_id}
loki_handler = LokiHandler(**loki_handler_kwargs)
# Wrap in QueueHandler for non-blocking operation
@ -133,23 +171,44 @@ def setup_logging(args):
print(f"WARNING: Failed to setup Loki logging: {e}")
print("Continuing with console-only logging")
# Get processor ID for log formatting (use 'unknown' if not available)
processor_id = args.get('id', 'unknown')
# Configure logging with all handlers
# Use processor ID as the primary identifier in logs
# Configure logging with all handlers. The processor id comes
# from _ProcessorIdFilter (via contextvar) and is injected into
# each record as record.processor_id. The format string reads
# that attribute on every emit.
logging.basicConfig(
level=getattr(logging, log_level.upper()),
format=f'%(asctime)s - {processor_id} - %(levelname)s - %(message)s',
format='%(asctime)s - %(processor_id)s - %(levelname)s - %(message)s',
handlers=handlers,
force=True # Force reconfiguration if already configured
)
# Prevent recursive logging from Loki's HTTP client
if loki_enabled and queue_listener:
# Disable urllib3 logging to prevent infinite loop
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('urllib3.connectionpool').setLevel(logging.WARNING)
# Attach the processor-id filter to every handler so all records
# passing through any sink get stamped (console, queue→loki,
# future handlers). Filters on handlers run regardless of which
# logger originated the record, so logs from pika, cassandra,
# processor code, etc. all pass through it.
processor_filter = _ProcessorIdFilter()
for h in handlers:
h.addFilter(processor_filter)
# Seed the contextvar from --id if one was supplied. In group
# mode --id isn't present; the processor_group supervisor sets
# it per task. In standalone mode AsyncProcessor.launch provides
# it via argparse default.
if args.get('id'):
set_processor_id(args['id'])
# Silence noisy third-party library loggers. These emit INFO-level
# chatter (connection churn, channel open/close, driver warnings) that
# drowns the useful signal and can't be attributed to a specific
# processor anyway. WARNING and above still propagate.
for noisy in (
'pika',
'cassandra',
'urllib3',
'urllib3.connectionpool',
):
logging.getLogger(noisy).setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.info(f"Logging configured with level: {log_level}")

View file

@ -29,7 +29,7 @@ import time
from prometheus_client import start_http_server
from . logging import add_logging_args, setup_logging
from . logging import add_logging_args, setup_logging, set_processor_id
logger = logging.getLogger(__name__)
@ -64,6 +64,13 @@ async def _supervise(entry):
pid = entry["params"]["id"]
class_path = entry["class"]
# Stamp the contextvar for this supervisor task. Every log
# record emitted from this task — and from any inner TaskGroup
# child created by the processor — inherits this id via
# contextvar propagation. Siblings in the outer group set
# their own id in their own task context and do not interfere.
set_processor_id(pid)
while True:
try:

View file

@ -227,15 +227,30 @@ class RabbitMQBackendConsumer:
self._connect()
def receive(self, timeout_millis: int = 2000) -> Message:
"""Receive a message. Raises TimeoutError if none available."""
"""Receive a message. Raises TimeoutError if none available.
Loop ordering matters: check _incoming at the TOP of each
iteration, not as the loop condition. process_data_events
may dispatch a message via the _on_message callback during
the pump; we must re-check _incoming on the next iteration
before giving up on the deadline. The previous control
flow (`while deadline: check; pump`) could lose a wakeup if
the pump consumed the remainder of the window — the
`while` check would fail before `_incoming` was re-read,
leaving a just-dispatched message stranded until the next
receive() call one full poll cycle later.
"""
if not self._is_alive():
self._connect()
timeout_seconds = timeout_millis / 1000.0
deadline = time.monotonic() + timeout_seconds
while time.monotonic() < deadline:
# Check if a message was already delivered
while True:
# Check if a message has been dispatched to our queue.
# This catches both (a) messages dispatched before this
# receive() was called and (b) messages dispatched
# during the previous iteration's process_data_events.
try:
method, properties, body = self._incoming.get_nowait()
return RabbitMQMessage(
@ -244,14 +259,16 @@ class RabbitMQBackendConsumer:
except queue.Empty:
pass
# Drive pika's I/O — delivers messages and processes heartbeats
remaining = deadline - time.monotonic()
if remaining > 0:
self._connection.process_data_events(
time_limit=min(0.1, remaining),
)
if remaining <= 0:
raise TimeoutError("No message received within timeout")
raise TimeoutError("No message received within timeout")
# Drive pika's I/O. Any messages delivered during this
# call land in _incoming via _on_message; the next
# iteration of this loop catches them at the top.
self._connection.process_data_events(
time_limit=min(0.1, remaining),
)
def acknowledge(self, message: Message) -> None:
if isinstance(message, RabbitMQMessage) and message._method: