# trustgraph/trustgraph-base/trustgraph/base/processor_group.py


# Multi-processor group runner.  Runs multiple AsyncProcessor descendants
# as concurrent tasks inside a single process, sharing one event loop,
# one Prometheus HTTP server, and one pub/sub backend pool.
#
# Intended for dev and resource-constrained deployments.  Scale deployments
# should continue to use per-processor endpoints.
#
# Group config is a YAML or JSON file with shape:
#
#   processors:
#     - class: trustgraph.extract.kg.definitions.extract.Processor
#       params:
#         id: kg-extract-definitions
#         triples_batch_size: 1000
#     - class: trustgraph.chunking.recursive.Processor
#       params:
#         id: chunker-recursive
#
# Each entry's params are passed directly to the class constructor, together
# with a `taskgroup` argument holding that processor's own nested TaskGroup.
# Defaults live inside each processor class.

import argparse
import asyncio
import importlib
import json
import logging
import time

from prometheus_client import start_http_server

from . logging import add_logging_args, setup_logging, set_processor_id

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


def _load_config(path):
    """Load a processor-group config file and return the parsed document.

    Files whose name ends in ``.yaml`` or ``.yml`` (compared
    case-insensitively) are parsed with ``yaml.safe_load``; every other
    file is parsed as JSON.

    Args:
        path: Location of the config file, as a ``str`` or an
            ``os.PathLike`` object such as ``pathlib.Path``.

    Returns:
        Whatever the parser produces for the file's top-level value.

    Raises:
        OSError: The file cannot be opened or read.
        Parser errors from ``json.loads`` / ``yaml.safe_load`` propagate
        unchanged.
    """
    import os

    # Decode as UTF-8 explicitly instead of relying on the locale default,
    # so the same file parses identically on every host.
    with open(path, encoding="utf-8") as f:
        text = f.read()

    # os.fspath() lets path-like objects through the suffix test; lower()
    # makes ".YAML" / ".Yml" select the YAML parser too.
    if os.fspath(path).lower().endswith((".yaml", ".yml")):
        # Imported only on this branch, so PyYAML is needed only when a
        # YAML config is actually used.
        import yaml
        return yaml.safe_load(text)
    return json.loads(text)


def _resolve_class(dotted):
    """Import and return the attribute named by a dotted path.

    The text before the final ``.`` is imported as a module and the text
    after it is looked up on that module with ``getattr``.

    Raises ``ValueError`` when nothing precedes the final dot — for
    example a bare name containing no dots at all.
    """
    cut = dotted.rfind(".")

    # cut == -1: no dot at all; cut == 0: the only dot is the first
    # character.  Either way there is no module part to import.
    if cut <= 0:
        raise ValueError(
            f"Processor class must be a dotted path, got {dotted!r}"
        )

    module_name = dotted[:cut]
    attr_name = dotted[cut + 1:]
    return getattr(importlib.import_module(module_name), attr_name)


# Seconds a supervisor waits between one run of its processor ending
# (cleanly or with an error) and the next start attempt.
RESTART_DELAY_SECONDS = 4


async def _supervise(entry):
    """Keep one processor running, restarting it whenever it stops.

    Each attempt gets a fresh nested TaskGroup, so a failure inside this
    processor is logged and retried here instead of propagating to the
    sibling supervisors in the outer group.  Cancellation is the one
    exception: it is logged and re-raised.

    Args:
        entry: One item from the group config's ``processors`` list; must
            provide ``class`` and ``params["id"]``.
    """

    proc_id = entry["params"]["id"]
    dotted_class = entry["class"]

    # Tag this supervisor task's context with the processor id.  Log
    # records emitted from this task, and from child tasks the processor
    # spawns in its inner TaskGroup, pick the id up through contextvar
    # propagation; sibling supervisors set their own id in their own
    # task context.
    set_processor_id(proc_id)

    def report_failure(exc, exc_info):
        # Single place for the failure log line used by both handlers.
        logger.error(
            f"Processor {proc_id} failure: {type(exc).__name__}: {exc}",
            exc_info=exc_info,
        )

    while True:

        try:

            async with asyncio.TaskGroup() as tg:

                processor_cls = _resolve_class(dotted_class)
                kwargs = dict(entry.get("params", {}), taskgroup=tg)

                logger.info(f"Starting {dotted_class} as {proc_id}")

                processor = processor_cls(**kwargs)
                await processor.start()
                tg.create_task(processor.run())

            # Reaching here means the TaskGroup drained without an error:
            # run() returned normally.  That is still treated as something
            # to recover from, mirroring a per-container
            # `restart: on-failure` policy.
            logger.warning(
                f"Processor {proc_id} exited cleanly, will restart"
            )

        except asyncio.CancelledError:
            # Cancellation must keep propagating; do not restart.
            logger.info(f"Processor {proc_id} cancelled")
            raise

        except BaseExceptionGroup as group:
            # Must precede `except Exception`: ExceptionGroup is also an
            # Exception, and each member should be logged individually.
            for member in group.exceptions:
                report_failure(member, member)

        except Exception as exc:
            report_failure(exc, True)

        logger.info(
            f"Restarting {proc_id} in {RESTART_DELAY_SECONDS}s..."
        )
        await asyncio.sleep(RESTART_DELAY_SECONDS)


async def run_group(config):
    """Validate a group config, then run every listed processor.

    All structural problems are reported as ``RuntimeError`` before any
    processor is started, so a malformed entry cannot bring down the
    running group part-way through.

    Args:
        config: Parsed group config — a mapping with a ``processors``
            list.  ``None`` (e.g. an empty config file) is treated as a
            config with no processors.

    Raises:
        RuntimeError: The config is not a mapping, lists no processors,
            or an entry is malformed: not a mapping, missing ``class``,
            ``params`` not a mapping, missing ``params.id``, or an id
            that duplicates an earlier entry's.
    """
    from collections.abc import Mapping

    if config is not None and not isinstance(config, Mapping):
        raise RuntimeError(
            f"Group config must be a mapping, got {type(config).__name__}"
        )

    entries = config.get("processors") if config else None
    if not entries:
        raise RuntimeError("Group config has no processors")

    if not isinstance(entries, (list, tuple)):
        raise RuntimeError(
            f"Group config 'processors' must be a list, "
            f"got {type(entries).__name__}"
        )

    seen_ids = set()
    for index, entry in enumerate(entries):

        if not isinstance(entry, Mapping):
            raise RuntimeError(
                f"Processor entry {index} must be a mapping, "
                f"got {type(entry).__name__}"
            )

        # The supervisor reads entry["class"] outside its retry loop, so
        # a missing class has to be rejected here.
        class_path = entry.get("class")
        if not isinstance(class_path, str) or not class_path:
            raise RuntimeError(
                f"Processor entry {index} missing 'class' — "
                f"expected a dotted class path"
            )

        # An absent or null `params` is treated as empty, which then fails
        # the id check below with the usual message.
        params = entry.get("params")
        if params is None:
            params = {}
        if not isinstance(params, Mapping):
            raise RuntimeError(
                f"Entry {class_path!r} params must be a mapping, "
                f"got {type(params).__name__}"
            )

        pid = params.get("id")
        if pid is None:
            raise RuntimeError(
                f"Entry {entry.get('class')!r} missing params.id — "
                f"required for metrics labelling"
            )
        if pid in seen_ids:
            raise RuntimeError(f"Duplicate processor id {pid!r} in group")
        seen_ids.add(pid)

    # One supervisor task per processor; the group exits only if a
    # supervisor itself raises or is cancelled.
    async with asyncio.TaskGroup() as outer_tg:
        for entry in entries:
            outer_tg.create_task(_supervise(entry))


def run():
    """Command-line entry point for the processor group runner.

    Parses arguments, configures logging, loads the group config, starts
    the Prometheus HTTP server unless metrics are disabled, then runs the
    group in a retry loop.  The loop ends only on a keyboard interrupt;
    any other exception is logged and the group is started again after a
    short pause.
    """

    parser = argparse.ArgumentParser(
        prog="processor-group",
        description="Run multiple processors as tasks in one process",
    )

    # (flags, add_argument keyword settings) for each runner option.
    option_specs = (
        (
            ("-c", "--config"),
            {
                "required": True,
                "help": "Path to group config file (JSON or YAML)",
            },
        ),
        (
            ("--metrics",),
            {
                "action": argparse.BooleanOptionalAction,
                "default": True,
                "help": "Metrics enabled (default: true)",
            },
        ),
        (
            ("-P", "--metrics-port"),
            {
                "type": int,
                "default": 8000,
                "help": "Prometheus metrics port (default: 8000)",
            },
        ),
    )
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    add_logging_args(parser)

    options = vars(parser.parse_args())

    setup_logging(options)

    group_config = _load_config(options["config"])

    # The metrics server is started once, outside the retry loop.
    if options["metrics"]:
        start_http_server(options["metrics_port"])

    while True:

        logger.info("Starting group...")

        try:
            asyncio.run(run_group(group_config))

        except KeyboardInterrupt:
            logger.info("Keyboard interrupt.")
            return

        except Exception as err:
            if isinstance(err, ExceptionGroup):
                # Log each member of the group separately.
                logger.error("Exception group:")
                for member in err.exceptions:
                    logger.error(f"  Type: {type(member)}")
                    logger.error(f"  Exception: {member}", exc_info=member)
            else:
                logger.error(f"Type: {type(err)}")
                logger.error(f"Exception: {err}", exc_info=True)

        logger.warning("Will retry...")
        time.sleep(4)
        logger.info("Retrying...")