remove dependency on docker-compose when starting up archgw (#305)

2026-05-04 05:12:55 +02:00 · 2024-11-26 13:13:02 -08:00 · 2024-11-26 13:13:02 -08:00 · 0ff3d43008
commit 0ff3d43008
parent 726f1a3185
16 changed files with 3761 additions and 274 deletions
--- a/arch/docker-compose.yaml
+++ b/arch/docker-compose.yaml
@ -1,24 +0,0 @@
 services:
  archgw:
    image: katanemo/archgw:latest
    ports:
      - "10000:10000"
      - "10001:10001"
      - "11000:11000"
      - "12000:12000"
      - "19901:9901"
    volumes:
      - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/app/arch_config.yaml
      - /etc/ssl/cert.pem:/etc/ssl/cert.pem
      - ~/archgw_logs:/var/log/
    env_file:
      - env.list
    environment:
      - OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces
    extra_hosts:
      - "host.docker.internal:host-gateway"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:10000/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3
--- a/arch/tools/build_cli.sh
+++ b/arch/tools/build_cli.sh
@ -1,23 +1,4 @@
 #!/bin/bash
 # Stage the files the CLI package bundles (schema, compose file, empty env.list)
 # into ./config and then install the CLI with poetry.
 # NOTE(review): all paths are relative, so this presumably has to be run from the
 # directory that contains this script -- confirm.
 # NOTE(review): there is no `set -e`; if a cp below fails the script still prints
 # the success message and goes on to `poetry install`.
 # Define paths
 source_schema="../arch_config_schema.yaml"
 source_compose="../docker-compose.yaml"
 destination_dir="config"
 # Ensure the destination directory exists only if it doesn't already
 if [ ! -d "$destination_dir" ]; then
    mkdir -p "$destination_dir"
    echo "Directory $destination_dir created."
 fi
 # Copy the files
 cp "$source_schema" "$destination_dir/arch_config_schema.yaml"
 cp "$source_compose" "$destination_dir/docker-compose.yaml"
 # Create env.list if missing (touch leaves existing content untouched)
 touch "$destination_dir/env.list"
 # Print success message
 echo "Files copied successfully!"
 echo "Building the cli"
 poetry install
--- a/arch/tools/cli/config_generator.py
+++ b/arch/tools/cli/config_generator.py
@ -56,7 +56,6 @@ def validate_and_render_schema():
                    "port": 80,  # default port
                }
    print(inferred_clusters)
    endpoints = config_yaml.get("endpoints", {})
    # override the inferred clusters with the ones defined in the config
@ -88,7 +87,6 @@ def validate_and_render_schema():
    }
    rendered = template.render(data)
    print(rendered)
    print(ENVOY_CONFIG_FILE_RENDERED)
    with open(ENVOY_CONFIG_FILE_RENDERED, "w") as file:
        file.write(rendered)
@ -108,7 +106,7 @@ def validate_prompt_config(arch_config_file, arch_config_schema_file):
        validate(config_yaml, config_schema_yaml)
    except Exception as e:
        print(
-            f"Error validating arch_config file: {arch_config_file}, error: {e.message}"
+            f"Error validating arch_config file: {arch_config_file}, schema file: {arch_config_schema_file}, error: {e.message}"
        )
        raise e
--- a/arch/tools/cli/consts.py
+++ b/arch/tools/cli/consts.py
@ -9,3 +9,5 @@ SERVICE_NAME_MODEL_SERVER = "model_server"
 SERVICE_ALL = "all"
 MODEL_SERVER_LOG_FILE = "~/archgw_logs/modelserver.log"
 ACCESS_LOG_FILES = "~/archgw_logs/access*"
 ARCHGW_DOCKER_NAME = "archgw"
 ARCHGW_DOCKER_IMAGE = "katanemo/archgw:latest"
--- a/arch/tools/cli/core.py
+++ b/arch/tools/cli/core.py
@ -1,40 +1,74 @@
 import subprocess
 import os
 import time
 import pkg_resources
 import select
 import sys
 import glob
-from cli.utils import run_docker_compose_ps, print_service_status, check_services_state
+import docker
 from cli.utils import getLogger
 from cli.consts import (
    ARCHGW_DOCKER_IMAGE,
    ARCHGW_DOCKER_NAME,
    KATANEMO_LOCAL_MODEL_LIST,
    MODEL_SERVER_LOG_FILE,
    ACCESS_LOG_FILES,
 )
 from huggingface_hub import snapshot_download
 from dotenv import dotenv_values
 log = getLogger(__name__)
 def start_archgw_docker(client, arch_config_file, env):
    """
    Start the arch gateway container in detached mode.

    Args:
        client: Docker SDK client; only `client.containers.run` is used.
        arch_config_file (str): Host path of the arch config file. It is
            bind-mounted read-only at /app/arch_config.yaml.
        env (dict): Extra environment variables for the container. They are
            merged after the default OTEL endpoint, so a matching key in
            `env` overrides that default.

    Returns:
        Whatever `client.containers.run` returns for a detached run.
    """
    logs_path_abs = os.path.expanduser("~/archgw_logs")

    # Keys are ports INSIDE the container, values are ports on the host.
    port_bindings = {
        "10000/tcp": 10000,
        "10001/tcp": 10001,
        "11000/tcp": 11000,
        "12000/tcp": 12000,
        # The container listens on 9901 and is published on host port 19901
        # (same as the former compose mapping "19901:9901"). Using 19901 as
        # the key would publish a container port nothing listens on.
        "9901/tcp": 19901,
    }

    return client.containers.run(
        name=ARCHGW_DOCKER_NAME,
        image=ARCHGW_DOCKER_IMAGE,
        detach=True,  # Run in detached mode
        ports=port_bindings,
        volumes={
            f"{arch_config_file}": {
                "bind": "/app/arch_config.yaml",
                "mode": "ro",
            },
            "/etc/ssl/cert.pem": {"bind": "/etc/ssl/cert.pem", "mode": "ro"},
            logs_path_abs: {"bind": "/var/log"},
        },
        environment={
            "OTEL_TRACING_HTTP_ENDPOINT": "http://host.docker.internal:4318/v1/traces",
            **env,
        },
        extra_hosts={"host.docker.internal": "host-gateway"},
        # Healthcheck durations are expressed in nanoseconds.
        healthcheck={
            "test": ["CMD", "curl", "-f", "http://localhost:10000/healthz"],
            "interval": 5000000000,  # 5 seconds
            "timeout": 1000000000,  # 1 seconds
            "retries": 3,
        },
    )
 def stream_gateway_logs(follow):
    """
    Stream logs from the arch gateway service.
    """
    compose_file = pkg_resources.resource_filename(
        __name__, "../config/docker-compose.yaml"
    )
    log.info("Logs from arch gateway service.")
-    options = ["docker", "compose", "-p", "arch", "logs"]
+    options = ["docker", "logs", "archgw"]
    if follow:
        options.append("-f")
    try:
        # Run `docker-compose logs` to stream logs from the gateway service
        subprocess.run(
            options,
            cwd=os.path.dirname(compose_file),
            check=True,
            stdout=sys.stdout,
            stderr=sys.stderr,
@ -88,42 +122,20 @@ def start_arch(arch_config_file, env, log_timeout=120):
    Start Docker Compose in detached mode and stream logs until services are healthy.
    Args:
-        path (str): The path where the prompt_confi.yml file is located.
+        path (str): The path where the prompt_config.yml file is located.
        log_timeout (int): Time in seconds to show logs before checking for healthy state.
    """
    log.info("Starting arch gateway")
    compose_file = pkg_resources.resource_filename(
        __name__, "../config/docker-compose.yaml"
    )
    try:
-        # Run the Docker Compose command in detached mode (-d)
+        client = docker.from_env()
-        subprocess.run(
+
-            [
+        container = start_archgw_docker(client, arch_config_file, env)
                "docker",
                "compose",
                "-p",
                "arch",
                "up",
                "-d",
            ],
            cwd=os.path.dirname(
                compose_file
            ),  # Ensure the Docker command runs in the correct path
            env=env,  # Pass the modified environment
            check=True,  # Raise an exception if the command fails
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        log.info(f"Arch docker-compose started in detached.")
        start_time = time.time()
        services_status = {}
        services_running = (
            False  # assume that the services are not running at the moment
        )
        while True:
            container = client.containers.get(container.id)
            current_time = time.time()
            elapsed_time = current_time - start_time
@ -132,53 +144,16 @@ def start_arch(arch_config_file, env, log_timeout=120):
                log.info(f"Stopping log monitoring after {log_timeout} seconds.")
                break
-            current_services_status = run_docker_compose_ps(
+            container_status = container.attrs["State"]["Health"]["Status"]
-                compose_file=compose_file, env=env
+
-            )
+            if container_status == "healthy":
-            if not current_services_status:
+                log.info("Container is healthy!")
                log.info(
                    "Status for the services could not be detected. Something went wrong. Please run docker logs"
                )
                break
            else:
                log.info(f"Container health status: {container_status}")
                time.sleep(1)
-            if not services_status:
+    except docker.errors.APIError as e:
                services_status = current_services_status  # set the first time
                print_service_status(
                    services_status
                )  # print the services status and proceed.
            # check if anyone service is failed or exited state, if so print and break out
            unhealthy_states = ["unhealthy", "exit", "exited", "dead", "bad"]
            running_states = ["running", "up"]
            if check_services_state(current_services_status, running_states):
                log.info("Arch gateway is up and running!")
                break
            if check_services_state(current_services_status, unhealthy_states):
                log.info(
                    "One or more Arch services are unhealthy. Please run `docker logs` for more information"
                )
                print_service_status(
                    current_services_status
                )  # print the services status and proceed.
                break
            # check to see if the status of one of the services has changed from prior. Print and loop over until finish, or error
            for service_name in services_status.keys():
                if (
                    services_status[service_name]["State"]
                    != current_services_status[service_name]["State"]
                ):
                    log.info(
                        "One or more Arch services have changed state. Printing current state"
                    )
                    print_service_status(current_services_status)
                    break
            services_status = current_services_status
    except subprocess.CalledProcessError as e:
        log.info(f"Failed to start Arch: {str(e)}")
@ -189,21 +164,16 @@ def stop_arch():
    Args:
        path (str): The path where the docker-compose.yml file is located.
    """
    compose_file = pkg_resources.resource_filename(
        __name__, "../config/docker-compose.yaml"
    )
    log.info("Shutting down arch gateway service.")
    try:
        # Run `docker-compose down` to shut down all services
        subprocess.run(
-            ["docker", "compose", "-p", "arch", "down"],
+            ["docker", "stop", "archgw"],
            cwd=os.path.dirname(compose_file),
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        subprocess.run(
            ["docker", "remove", "archgw"],
        )
        log.info("Successfully shut down arch gateway service.")
    except subprocess.CalledProcessError as e:
--- a/arch/tools/cli/main.py
+++ b/arch/tools/cli/main.py
@ -1,13 +1,16 @@
 import click
 import os
 import pkg_resources
 import sys
 import subprocess
 import multiprocessing
 import importlib.metadata
 from cli import targets
-from cli import config_generator
+from cli.utils import (
-from cli.utils import getLogger, get_llm_provider_access_keys, load_env_file_to_dict
+    getLogger,
    get_llm_provider_access_keys,
    load_env_file_to_dict,
    validate_schema,
 )
 from cli.core import (
    start_arch_modelserver,
    stop_arch_modelserver,
@ -160,17 +163,12 @@ def up(file, path, service):
        return
    log.info(f"Validating {arch_config_file}")
    arch_schema_config = pkg_resources.resource_filename(
        __name__, "../config/arch_config_schema.yaml"
    )
    try:
-        config_generator.validate_prompt_config(
+        validate_schema(arch_config_file)
            arch_config_file=arch_config_file,
            arch_config_schema_file=arch_schema_config,
        )
    except Exception as e:
        log.info(f"Exiting archgw up: validation failed")
        log.info(f"Error: {str(e)}")
        sys.exit(1)
    log.info("Starging arch model server and arch gateway")
@ -213,14 +211,7 @@ def up(file, path, service):
                else:
                    env_stage[access_key] = env_file_dict[access_key]
    with open(
        pkg_resources.resource_filename(__name__, "../config/env.list"), "w"
    ) as file:
        for key, value in env_stage.items():
            file.write(f"{key}={value}\n")
    env.update(env_stage)
    env["ARCH_CONFIG_FILE"] = arch_config_file
    if service == SERVICE_NAME_ARCHGW:
        start_arch(arch_config_file, env)
--- a/arch/tools/cli/utils.py
+++ b/arch/tools/cli/utils.py
@ -1,11 +1,8 @@
 import subprocess
 import os
 import time
 import select
 import shlex
 import yaml
 import json
 import logging
 import docker
 from cli.consts import ARCHGW_DOCKER_IMAGE, ARCHGW_DOCKER_NAME
 logging.basicConfig(
    level=logging.INFO,
@ -22,72 +19,39 @@ def getLogger(name="cli"):
 log = getLogger(__name__)
-def run_docker_compose_ps(compose_file, env):
+def validate_schema(arch_config_file: str) -> None:
    """
    Check if all Docker Compose services are in a healthy state.
    Args:
        path (str): The path where the docker-compose.yml file is located.
    """
    try:
-        # Run `docker compose ps` to get the health status of each service.
+        client = docker.from_env()
-        # This should be a non-blocking call so using subprocess.Popen(...)
+        # Run the container with detach=True to avoid blocking main process
-        ps_process = subprocess.Popen(
+        container = client.containers.run(
-            [
+            image=ARCHGW_DOCKER_IMAGE,
-                "docker",
+            volumes={
-                "compose",
+                f"{arch_config_file}": {
-                "-p",
+                    "bind": "/app/arch_config.yaml",
-                "arch",
+                    "mode": "ro",
-                "ps",
+                },
-                "--format",
+            },
-                "table{{.Service}}\t{{.State}}\t{{.Ports}}",
+            entrypoint=["python", "config_generator.py"],
-            ],
+            detach=True,
            cwd=os.path.dirname(compose_file),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            start_new_session=True,
            env=env,
        )
        # Capture the output of `docker-compose ps`
        services_status, error_output = ps_process.communicate()
-        # Check if there is any error output
+        # Wait for the container to finish and get the exit code
-        if error_output:
+        exit_code = container.wait()
-            log.info(
+
-                f"Error while checking service status:\n{error_output}",
+        # Check exit code for validation success
-                file=os.sys.stderr,
+        if exit_code["StatusCode"] != 0:
            # Validation failed (non-zero exit code)
            logs = container.logs().decode()  # Get container logs for debugging
            raise ValueError(
                f"Validation failed. Container exited with code {exit_code}.\nLogs:\n{logs}"
            )
            return {}
-        services = parse_docker_compose_ps_output(services_status)
+        # Successful validation (exit code 0)
-        return services
+        log.info("Schema validation successful!")
-    except subprocess.CalledProcessError as e:
+    except docker.errors.APIError as e:
-        log.info(f"Failed to check service status. Error:\n{e.stderr}")
+        # Handle container creation error
-        return e
+        raise ValueError(f"Failed to create container: {e}")
 # Helper method to print service status
 def print_service_status(services):
    """
    Log a fixed-width table of service name, state and ports.

    Args:
        services (dict): Maps service name to a dict holding "STATE" and
            "PORTS" entries.
    """
    header = f"{'Service Name':<25} {'State':<20} {'Ports'}"
    log.info(header)
    log.info("=" * 72)
    for name in services:
        entry = services[name]
        state = entry["STATE"]
        published = entry["PORTS"]
        log.info(f"{name:<25} {state:<20} {published}")
 # check for states based on the states passed in
 def check_services_state(services, states):
    """
    Report whether any service's state contains one of the given state names.

    Matching is a case-insensitive substring test against each service's
    "STATE" entry (the entry is lowercased; `states` are used as given).

    Args:
        services (dict): Maps service name to a dict holding a "STATE" entry.
        states: Iterable of state fragments to look for.

    Returns:
        bool: True on the first match, False when nothing matches.
    """
    for info in services.values():
        current = info["STATE"].lower()  # lowercase once per service
        for wanted in states:
            if wanted in current:
                return True
    return False
 def get_llm_provider_access_keys(arch_config_file):
@ -127,28 +91,3 @@ def load_env_file_to_dict(file_path):
                env_dict[key] = value
    return env_dict
 def parse_docker_compose_ps_output(output):
    """
    Parse a whitespace-separated status table into a dict keyed by service.

    The first line is the header row; each following line describes one
    service as: name, state, then ports.

    Args:
        output (str): Raw table text.

    Returns:
        dict: Maps each service name to a dict keyed by the second and third
            header names, holding that row's state and ports. The ports value
            is every remaining column joined by single spaces, or "" when the
            row has none. Returns {} when the output is empty or the header
            has fewer than three columns.
    """
    lines = output.strip().splitlines()
    if not lines:
        # No header row at all (e.g. the command printed nothing).
        return {}

    headers = lines[0].split()
    if len(headers) < 3:
        # Without state/ports header names there is nothing to key rows by.
        return {}
    state_key, ports_key = headers[1], headers[2]

    services = {}
    for line in lines[1:]:
        parts = line.split()
        if len(parts) < 2:
            # Blank or truncated row: skip it instead of raising IndexError.
            continue
        services[parts[0]] = {
            state_key: parts[1],
            # A ports cell such as "a->b/tcp, c->d/tcp" contains spaces, so
            # keep everything after the state column rather than one token.
            ports_key: " ".join(parts[2:]),
        }
    return services
--- a/arch/tools/poetry.lock
+++ b/arch/tools/poetry.lock
--- a/arch/tools/pyproject.toml
+++ b/arch/tools/pyproject.toml
@ -8,22 +8,18 @@ packages = [
 ]
 readme = "README.md"
 include = [
 # Include package data (docker-compose.yaml and other files)
    "config/docker-compose.yaml",
    "config/arch_config_schema.yaml",
 ]
 [tool.poetry.dependencies]
 python = ">=3.12"
 archgw_modelserver = "0.1.4"
 pyyaml = "^6.0.2"
-pydantic = "^2.9.2"
+pydantic = "^2.10.1"
 click = "^8.1.7"
 jinja2 = "^3.1.4"
 jsonschema = "^4.23.0"
 setuptools = "75.5.0"
 huggingface_hub = "^0.26.0"
 docker = "^7.1.0"
 python-dotenv = "^1.0.1"
 [tool.poetry.scripts]
 archgw = "cli.main:main"
--- a/demos/hr_agent/run_demo.sh
+++ b/demos/hr_agent/run_demo.sh
@ -1,4 +1,5 @@
 #!/bin/bash
 set -e
 # Function to start the demo
 start_demo() {
--- a/demos/insurance_agent/run_demo.sh
+++ b/demos/insurance_agent/run_demo.sh
@ -1,4 +1,5 @@
 #!/bin/bash
 set -e
 # Function to start the demo
 start_demo() {
--- a/demos/network_agent/run_demo.sh
+++ b/demos/network_agent/run_demo.sh
@ -1,4 +1,5 @@
 #!/bin/bash
 set -e
 # Function to start the demo
 start_demo() {
--- a/demos/weather_forecast/run_demo.sh
+++ b/demos/weather_forecast/run_demo.sh
@ -1,4 +1,5 @@
 #!/bin/bash
 set -e
 # Function to start the demo
 start_demo() {
--- a/demos/weather_forecast_signoz/run_demo.sh
+++ b/demos/weather_forecast_signoz/run_demo.sh
@ -1,4 +1,5 @@
 #!/bin/bash
 set -e
 # Function to start the demo
 start_demo() {
--- a/e2e_tests/run_e2e_tests.sh
+++ b/e2e_tests/run_e2e_tests.sh
@ -13,13 +13,14 @@ print_debug() {
  log "Received signal to stop"
  log "Printing debug logs for model_server"
  log "===================================="
-  tail -n 500 ~/archgw_logs/modelserver.log
+  tail -n 100 ~/archgw_logs/modelserver.log
  log "Printing debug logs for docker"
  log "===================================="
-  tail -n 500 ../build.log
+  tail -n 100 ../build.log
  archgw logs --debug | tail -n 100
 }
-# trap 'print_debug' INT TERM ERR
+trap 'print_debug' INT TERM ERR
 log starting > ../build.log
--- a/model_server/app/commons/utilities.py
+++ b/model_server/app/commons/utilities.py
@ -3,7 +3,6 @@ import yaml
 import torch
 import string
 import logging
 import pkg_resources
 from openai import OpenAI
@ -11,13 +10,6 @@ from openai import OpenAI
 logger_instance = None
 def load_yaml_config(file_name):
    """
    Read a YAML file shipped inside the "app" package and return its parsed content.

    Args:
        file_name (str): Resource name resolved relative to the "app" package.

    Returns:
        The object produced by `yaml.safe_load` for that file.
    """
    resolved_path = pkg_resources.resource_filename("app", file_name)
    with open(resolved_path, "r") as handle:
        parsed = yaml.safe_load(handle)
    return parsed
 def get_device():
    available_device = {
        "cpu": True,