plano/arch/tools/cli/core.py

187 lines
5.7 KiB
Python

import subprocess
import os
import time
import sys
import yaml
from cli.utils import getLogger
from cli.consts import (
ARCHGW_DOCKER_IMAGE,
ARCHGW_DOCKER_NAME,
KATANEMO_LOCAL_MODEL_LIST,
)
from huggingface_hub import snapshot_download
import subprocess
from cli.docker_cli import (
docker_container_status,
docker_remove_container,
docker_start_archgw_detached,
docker_stop_container,
health_check_endpoint,
stream_gateway_logs,
)
log = getLogger(__name__)
def _get_gateway_ports(arch_config_file: str) -> tuple:
PROMPT_GATEWAY_DEFAULT_PORT = 10000
LLM_GATEWAY_DEFAULT_PORT = 12000
# parse arch_config_file yaml file and get prompt_gateway_port
arch_config_dict = {}
with open(arch_config_file) as f:
arch_config_dict = yaml.safe_load(f)
prompt_gateway_port = (
arch_config_dict.get("listeners", {})
.get("ingress_traffic", {})
.get("port", PROMPT_GATEWAY_DEFAULT_PORT)
)
llm_gateway_port = (
arch_config_dict.get("listeners", {})
.get("egress_traffic", {})
.get("port", LLM_GATEWAY_DEFAULT_PORT)
)
return prompt_gateway_port, llm_gateway_port
def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
"""
Start Docker Compose in detached mode and stream logs until services are healthy.
Args:
path (str): The path where the prompt_config.yml file is located.
log_timeout (int): Time in seconds to show logs before checking for healthy state.
"""
log.info(
f"Starting arch gateway, image name: {ARCHGW_DOCKER_NAME}, tag: {ARCHGW_DOCKER_IMAGE}"
)
try:
archgw_container_status = docker_container_status(ARCHGW_DOCKER_NAME)
if archgw_container_status != "not found":
log.info("archgw found in docker, stopping and removing it")
docker_stop_container(ARCHGW_DOCKER_NAME)
docker_remove_container(ARCHGW_DOCKER_NAME)
prompt_gateway_port, llm_gateway_port = _get_gateway_ports(arch_config_file)
return_code, _, archgw_stderr = docker_start_archgw_detached(
arch_config_file,
os.path.expanduser("~/archgw_logs"),
env,
prompt_gateway_port,
llm_gateway_port,
)
if return_code != 0:
log.info("Failed to start arch gateway: " + str(return_code))
log.info("stderr: " + archgw_stderr)
sys.exit(1)
start_time = time.time()
while True:
prompt_gateway_health_check_status = health_check_endpoint(
f"http://localhost:{prompt_gateway_port}/healthz"
)
llm_gateway_health_check_status = health_check_endpoint(
f"http://localhost:{llm_gateway_port}/healthz"
)
archgw_status = docker_container_status(ARCHGW_DOCKER_NAME)
current_time = time.time()
elapsed_time = current_time - start_time
if archgw_status == "exited":
log.info("archgw container exited unexpectedly.")
stream_gateway_logs(follow=False)
sys.exit(1)
# Check if timeout is reached
if elapsed_time > log_timeout:
log.info(f"stopping log monitoring after {log_timeout} seconds.")
stream_gateway_logs(follow=False)
sys.exit(1)
if prompt_gateway_health_check_status or llm_gateway_health_check_status:
log.info("archgw is running and is healthy!")
break
else:
log.info(f"archgw status: {archgw_status}, health status: starting")
time.sleep(1)
if foreground:
stream_gateway_logs(follow=True)
except KeyboardInterrupt:
log.info("Keyboard interrupt received, stopping arch gateway service.")
stop_docker_container()
def stop_docker_container(service=ARCHGW_DOCKER_NAME):
"""
Shutdown all Docker Compose services by running `docker-compose down`.
Args:
path (str): The path where the docker-compose.yml file is located.
"""
log.info(f"Shutting down {service} service.")
try:
subprocess.run(
["docker", "stop", service],
)
subprocess.run(
["docker", "rm", service],
)
log.info(f"Successfully shut down {service} service.")
except subprocess.CalledProcessError as e:
log.info(f"Failed to shut down services: {str(e)}")
def download_models_from_hf():
for model in KATANEMO_LOCAL_MODEL_LIST:
log.info(f"Downloading model: {model}")
snapshot_download(repo_id=model)
def start_arch_modelserver(foreground):
"""
Start the model server. This assumes that the archgw_modelserver package is installed locally
"""
try:
log.info("archgw_modelserver restart")
if foreground:
subprocess.run(
["archgw_modelserver", "start", "--foreground"],
check=True,
)
else:
subprocess.run(
["archgw_modelserver", "start"],
check=True,
)
except subprocess.CalledProcessError as e:
log.info(f"Failed to start model_server. Please check archgw_modelserver logs")
sys.exit(1)
def stop_arch_modelserver():
"""
Stop the model server. This assumes that the archgw_modelserver package is installed locally
"""
try:
subprocess.run(
["archgw_modelserver", "stop"],
check=True,
)
except subprocess.CalledProcessError as e:
log.info(f"Failed to start model_server. Please check archgw_modelserver logs")
sys.exit(1)