From d3c17c7abd7034aa7d9dbba8ea83c9bfdf7a30a0 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 15 Nov 2024 10:44:01 -0800 Subject: [PATCH] move custom tracer to llm filter (#267) --- arch/Dockerfile | 11 ++- arch/docker-compose.dev.yaml | 9 ++- arch/docker-compose.e2e.yaml | 1 + arch/docker-compose.yaml | 4 +- arch/envoy.template.yaml | 74 ++++++++++++++++++- .../trace_streamer => arch}/stream_traces.py | 13 +--- arch/supervisord.conf | 25 +++++++ arch/tools/cli/config_generator.py | 2 +- archgw.code-workspace | 19 +++-- crates/llm_gateway/src/stream_context.rs | 72 ++++++++++++++++-- crates/llm_gateway/tests/integration.rs | 2 + crates/prompt_gateway/src/http_context.rs | 37 ---------- demos/llm_routing/README.md | 19 +++++ demos/llm_routing/docker-compose.yaml | 20 ++++- demos/shared/chatbot_ui/.vscode/launch.json | 10 ++- demos/shared/chatbot_ui/common.py | 44 +++++++++-- demos/shared/chatbot_ui/run_stream.py | 42 ++++++++--- .../grafana/dashboards/envoy_overview.json | 27 +++++-- demos/shared/trace_streamer/Dockerfile | 11 --- demos/weather_forecast/docker-compose.yaml | 8 -- .../docker-compose.yaml | 8 -- e2e_tests/run_e2e_tests.sh | 10 --- 22 files changed, 335 insertions(+), 133 deletions(-) rename {demos/shared/trace_streamer => arch}/stream_traces.py (74%) create mode 100644 arch/supervisord.conf create mode 100644 demos/llm_routing/README.md delete mode 100644 demos/shared/trace_streamer/Dockerfile diff --git a/arch/Dockerfile b/arch/Dockerfile index 74cfd40a..0d96713c 100644 --- a/arch/Dockerfile +++ b/arch/Dockerfile @@ -13,16 +13,21 @@ FROM envoyproxy/envoy:v1.32-latest as envoy #Build config generator, so that we have a single build image for both Rust and Python FROM python:3.12-slim as arch -RUN apt-get update && apt-get install -y gettext-base curl && apt-get clean && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install -y gettext-base curl supervisor && apt-get clean && rm -rf /var/lib/apt/lists/* COPY --from=builder 
/arch/target/wasm32-wasip1/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm COPY --from=builder /arch/target/wasm32-wasip1/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy -WORKDIR /config +WORKDIR /app COPY arch/requirements.txt . RUN pip install -r requirements.txt COPY arch/tools/cli/config_generator.py . COPY arch/envoy.template.yaml . COPY arch/arch_config_schema.yaml . +COPY arch/supervisord.conf /etc/supervisor/conf.d/supervisord.conf +COPY arch/stream_traces.py . -ENTRYPOINT ["sh", "-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug 2>&1 | tee /var/log/envoy.log"] +RUN pip install requests +RUN touch /var/log/envoy.log + +ENTRYPOINT ["supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] diff --git a/arch/docker-compose.dev.yaml b/arch/docker-compose.dev.yaml index 134f3853..378e0eca 100644 --- a/arch/docker-compose.dev.yaml +++ b/arch/docker-compose.dev.yaml @@ -8,11 +8,11 @@ services: - "12000:12000" - "19901:9901" volumes: - - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/config/arch_config.yaml + - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/app/arch_config.yaml - /etc/ssl/cert.pem:/etc/ssl/cert.pem - - ./envoy.template.yaml:/config/envoy.template.yaml - - ./arch_config_schema.yaml:/config/arch_config_schema.yaml - - ./tools/cli/config_generator.py:/config/config_generator.py + - ./envoy.template.yaml:/app/envoy.template.yaml + - ./arch_config_schema.yaml:/app/arch_config_schema.yaml + - ./tools/cli/config_generator.py:/app/config_generator.py - ../crates/target/wasm32-wasip1/release/llm_gateway.wasm:/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm - ../crates/target/wasm32-wasip1/release/prompt_gateway.wasm:/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm - ~/archgw_logs:/var/log/ @@ 
-21,3 +21,4 @@ services: environment: - OPENAI_API_KEY=${OPENAI_API_KEY:?error} - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error} + - OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces diff --git a/arch/docker-compose.e2e.yaml b/arch/docker-compose.e2e.yaml index 42195962..208dc1e7 100644 --- a/arch/docker-compose.e2e.yaml +++ b/arch/docker-compose.e2e.yaml @@ -16,3 +16,4 @@ services: environment: - OPENAI_API_KEY=${OPENAI_API_KEY:?error} - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error} + - OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces diff --git a/arch/docker-compose.yaml b/arch/docker-compose.yaml index 05d2c05f..51874ead 100644 --- a/arch/docker-compose.yaml +++ b/arch/docker-compose.yaml @@ -8,11 +8,13 @@ services: - "12000:12000" - "19901:9901" volumes: - - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/config/arch_config.yaml + - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/app/arch_config.yaml - /etc/ssl/cert.pem:/etc/ssl/cert.pem - ~/archgw_logs:/var/log/ env_file: - env.list + environment: + - OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces extra_hosts: - "host.docker.internal:host-gateway" healthcheck: diff --git a/arch/envoy.template.yaml b/arch/envoy.template.yaml index f08a2b2f..52671f99 100644 --- a/arch/envoy.template.yaml +++ b/arch/envoy.template.yaml @@ -242,11 +242,66 @@ static_resources: typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - - name: arch_listener_llm + + - name: arch_listener_http_llm address: socket_address: address: 0.0.0.0 port_value: 12000 + traffic_direction: INBOUND + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %} + generate_request_id: true + 
tracing: + provider: + name: envoy.tracers.opentelemetry + typed_config: + "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig + grpc_service: + envoy_grpc: + cluster_name: opentelemetry_collector + timeout: 0.250s + service_name: arch_gateway + random_sampling: + value: {{ arch_tracing.random_sampling }} + {% endif %} + stat_prefix: arch_listener_http + codec_type: AUTO + scheme_header_transformation: + scheme_to_overwrite: https + access_log: + - name: envoy.access_loggers.file + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog + path: "/var/log/access_llm.log" + route_config: + name: local_routes + virtual_hosts: + - name: local_service + domains: + - "*" + routes: + - match: + prefix: "/" + route: + auto_host_rewrite: true + cluster: arch_listener_llm + timeout: 60s + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + + + - name: arch_listener_llm + address: + socket_address: + address: 0.0.0.0 + port_value: 12001 filter_chains: - filters: - name: envoy.filters.network.http_connection_manager @@ -479,6 +534,23 @@ static_resources: port_value: 10001 hostname: arch_prompt_gateway_listener + - name: arch_listener_llm + connect_timeout: 5s + type: LOGICAL_DNS + dns_lookup_family: V4_ONLY + lb_policy: ROUND_ROBIN + load_assignment: + cluster_name: arch_listener_llm + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: 0.0.0.0 + port_value: 12001 + hostname: arch_listener_llm + + {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %} - name: opentelemetry_collector type: STRICT_DNS diff --git a/demos/shared/trace_streamer/stream_traces.py b/arch/stream_traces.py similarity index 74% rename from demos/shared/trace_streamer/stream_traces.py rename to arch/stream_traces.py index 4f1bf20c..1a165a8a 100644 --- 
a/demos/shared/trace_streamer/stream_traces.py +++ b/arch/stream_traces.py @@ -1,4 +1,5 @@ import os +import sys import time import requests import logging @@ -29,14 +30,8 @@ def process_log_line(line): logging.error(f"Failed to send trace to otel-tracing: {e}") -with open(envoy_log_path, "r") as f: - # Seek to the end of the file so we only read new lines - f.seek(0, os.SEEK_END) - while True: - line = f.readline() - if not line: - time.sleep(1) - continue - tokens = line.split("prompt_gateway: upstream_llm trace details: ") +for line in sys.stdin: + if line: + tokens = line.split("gateway: upstream_llm trace details: ") if len(tokens) > 1: process_log_line(tokens[1]) diff --git a/arch/supervisord.conf b/arch/supervisord.conf new file mode 100644 index 00000000..da659e65 --- /dev/null +++ b/arch/supervisord.conf @@ -0,0 +1,25 @@ +[supervisord] +nodaemon=true + +[program:trace_streamer] +command=sh -c "tail -F /var/log/envoy.log | python stream_traces.py" +autostart=true +autorestart=false +startretries=3 +priority=1 +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr +stdout_logfile_maxbytes = 0 +stderr_logfile_maxbytes = 0 + + +[program:envoy] +command=sh -c "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug 2>&1 | tee /var/log/envoy.log" +autostart=true +autorestart=true +startretries=3 +priority=2 +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr +stdout_logfile_maxbytes = 0 +stderr_logfile_maxbytes = 0 diff --git a/arch/tools/cli/config_generator.py b/arch/tools/cli/config_generator.py index 90c1406f..b8c35562 100644 --- a/arch/tools/cli/config_generator.py +++ b/arch/tools/cli/config_generator.py @@ -6,7 +6,7 @@ from jsonschema import validate ENVOY_CONFIG_TEMPLATE_FILE = os.getenv( "ENVOY_CONFIG_TEMPLATE_FILE", "envoy.template.yaml" ) -ARCH_CONFIG_FILE = os.getenv("ARCH_CONFIG_FILE", "/config/arch_config.yaml") +ARCH_CONFIG_FILE = 
os.getenv("ARCH_CONFIG_FILE", "/app/arch_config.yaml") ENVOY_CONFIG_FILE_RENDERED = os.getenv( "ENVOY_CONFIG_FILE_RENDERED", "/etc/envoy/envoy.yaml" ) diff --git a/archgw.code-workspace b/archgw.code-workspace index 3e36fff2..07b23996 100644 --- a/archgw.code-workspace +++ b/archgw.code-workspace @@ -21,22 +21,25 @@ "path": "e2e_tests" }, { - "name": "demos/weather_forecast", - "path": "./demos/weather_forecast", - }, - { - "name": "demos/insurance_agent", - "path": "./demos/insurance_agent", - }, + "name": "chatbot_ui", + "path": "demos/shared/chatbot_ui" + } ], "settings": { + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true + }, }, "extensions": { "recommendations": [ "ms-python.python", "ms-python.debugpy", "rust-lang.rust-analyzer", - "humao.rest-client" + "humao.rest-client", + "github.copilot", + "eamodio.gitlens", + "ms-python.black-formatter", ] } } diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 38266f72..7e35e7f2 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -1,17 +1,18 @@ use crate::filter_context::WasmMetrics; use common::common_types::open_ai::{ ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse, - StreamOptions, + Message, StreamOptions, }; use common::configuration::LlmProvider; use common::consts::{ ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH, - RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, + RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, TRACE_PARENT_HEADER, }; use common::errors::ServerError; use common::llm_providers::LlmProviders; use common::pii::obfuscate_auth_header; use common::ratelimit::Header; +use common::tracing::{Event, Span}; use common::{ratelimit, routing, tokenizer}; use http::StatusCode; use log::{debug, trace, warn}; @@ -23,7 +24,7 @@ use std::rc::Rc; use common::stats::{IncrementingMetric, 
RecordingMetric}; use proxy_wasm::hostcalls::get_current_time; -use std::time::{Duration, SystemTime}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; pub struct StreamContext { context_id: u32, @@ -36,7 +37,10 @@ pub struct StreamContext { llm_provider: Option>, request_id: Option, start_time: Option, - ttft_duration: Option, // Store the duration directly + ttft_duration: Option, + ttft_time: Option, + pub traceparent: Option, + user_message: Option, } impl StreamContext { @@ -53,6 +57,9 @@ impl StreamContext { request_id: None, start_time: None, ttft_duration: None, + traceparent: None, + ttft_time: None, + user_message: None, } } fn llm_provider(&self) -> &LlmProvider { @@ -176,9 +183,10 @@ impl HttpContext for StreamContext { ); self.request_id = self.get_http_request_header(REQUEST_ID_HEADER); + self.traceparent = self.get_http_request_header(TRACE_PARENT_HEADER); //start the timing for the request using get_current_time() - let current_time = get_current_time().unwrap(); + let current_time: SystemTime = get_current_time().unwrap(); self.start_time = Some(current_time); self.ttft_duration = None; @@ -229,6 +237,13 @@ impl HttpContext for StreamContext { message.model = None; } + self.user_message = deserialized_body + .messages + .iter() + .filter(|m| m.role == "user") + .last() + .cloned(); + // override model name from the llm provider deserialized_body .model @@ -318,6 +333,52 @@ impl HttpContext for StreamContext { .output_sequence_length .record(self.response_tokens as u64); + if let Some(traceparent) = self.traceparent.as_ref() { + let since_the_epoch_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + + let traceparent_tokens = traceparent.split("-").collect::>(); + if traceparent_tokens.len() != 4 { + warn!("traceparent header is invalid: {}", traceparent); + return Action::Continue; + } + let parent_trace_id = traceparent_tokens[1]; + let parent_span_id = traceparent_tokens[2]; + let mut trace_data = 
common::tracing::TraceData::new(); + let mut llm_span = Span::new( + "upstream_llm_time".to_string(), + parent_trace_id.to_string(), + Some(parent_span_id.to_string()), + self.start_time + .unwrap() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(), + since_the_epoch_ns, + ); + if let Some(user_message) = self.user_message.as_ref() { + if let Some(prompt) = user_message.content.as_ref() { + llm_span.add_attribute("user_prompt".to_string(), prompt.to_string()); + } + } + llm_span.add_attribute("model".to_string(), self.llm_provider().name.to_string()); + llm_span.add_event(Event::new( + "time_to_first_token".to_string(), + self.ttft_time + .unwrap() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(), + )); + trace_data.add_span(llm_span); + + let trace_data_str = serde_json::to_string(&trace_data).unwrap(); + debug!("upstream_llm trace details: {}", trace_data_str); + // send trace_data to http tracing endpoint + } + return Action::Continue; } @@ -413,6 +474,7 @@ impl HttpContext for StreamContext { if self.ttft_duration.is_none() { if let Some(start_time) = self.start_time { let current_time = get_current_time().unwrap(); + self.ttft_time = Some(current_time); match current_time.duration_since(start_time) { Ok(duration) => { let duration_ms = duration.as_millis(); diff --git a/crates/llm_gateway/tests/integration.rs b/crates/llm_gateway/tests/integration.rs index 7107b4d2..a40389aa 100644 --- a/crates/llm_gateway/tests/integration.rs +++ b/crates/llm_gateway/tests/integration.rs @@ -51,6 +51,8 @@ fn request_headers_expectations(module: &mut Tester, http_context: i32) { .expect_log(Some(LogLevel::Debug), None) .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id")) .returning(None) + .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("traceparent")) + .returning(None) .expect_get_current_time_nanos() .returning(Some(0)) .execute_and_expect(ReturnType::Action(Action::Continue)) diff --git 
a/crates/prompt_gateway/src/http_context.rs b/crates/prompt_gateway/src/http_context.rs index 3174a597..aef5b491 100644 --- a/crates/prompt_gateway/src/http_context.rs +++ b/crates/prompt_gateway/src/http_context.rs @@ -18,7 +18,6 @@ use common::{ errors::ServerError, http::{CallArgs, Client}, pii::obfuscate_auth_header, - tracing::{Event, Span}, }; use http::StatusCode; use log::{debug, trace, warn}; @@ -265,42 +264,6 @@ impl HttpContext for StreamContext { } if end_of_stream && body_size == 0 { - if let Some(traceparent) = self.traceparent.as_ref() { - let since_the_epoch_ns = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_nanos(); - - let traceparent_tokens = traceparent.split("-").collect::>(); - if traceparent_tokens.len() != 4 { - warn!("traceparent header is invalid: {}", traceparent); - return Action::Continue; - } - let parent_trace_id = traceparent_tokens[1]; - let parent_span_id = traceparent_tokens[2]; - let mut trace_data = common::tracing::TraceData::new(); - let mut llm_span = Span::new( - "upstream_llm_time".to_string(), - parent_trace_id.to_string(), - Some(parent_span_id.to_string()), - self.start_upstream_llm_request_time, - since_the_epoch_ns, - ); - if let Some(prompt) = self.user_prompt.as_ref() { - if let Some(content) = prompt.content.as_ref() { - llm_span.add_attribute("user_prompt".to_string(), content.to_string()); - } - } - llm_span.add_event(Event::new( - "time_to_first_token".to_string(), - self.time_to_first_token.unwrap(), - )); - trace_data.add_span(llm_span); - - let trace_data_str = serde_json::to_string(&trace_data).unwrap(); - debug!("upstream_llm trace details: {}", trace_data_str); - // send trace_data to http tracing endpoint - } return Action::Continue; } diff --git a/demos/llm_routing/README.md b/demos/llm_routing/README.md new file mode 100644 index 00000000..f5a49971 --- /dev/null +++ b/demos/llm_routing/README.md @@ -0,0 +1,19 @@ +# LLM Routing +This demo shows how you can use arch gateway to manage keys 
and route to appropriate LLM. + +# Starting the demo +1. Please make sure the [pre-requisites](https://github.com/katanemo/arch/?tab=readme-ov-file#prerequisites) are installed correctly +1. Start Arch + ```sh + sh run_demo.sh + ``` +1. Navigate to http://localhost:18080/ + +# Observability +Arch gateway publishes stats endpoint at http://localhost:19901/stats. In this demo we are using prometheus to pull stats from arch and we are using grafana to visualize the stats in dashboard. To see grafana dashboard follow the instructions below: + +1. Navigate to http://localhost:3000/ to open grafana UI (use admin/grafana as credentials) +1. From grafana left nav click on dashboards and select "Intelligent Gateway Overview" to view arch gateway stats + +# Selecting different LLM +You can pick different LLM based on header `x-arch-llm-provider-hint` to override default LLM. diff --git a/demos/llm_routing/docker-compose.yaml b/demos/llm_routing/docker-compose.yaml index 1ce6963b..ac59499c 100644 --- a/demos/llm_routing/docker-compose.yaml +++ b/demos/llm_routing/docker-compose.yaml @@ -2,7 +2,7 @@ services: chatbot_ui: build: - context: ../../chatbot_ui + context: ../shared/chatbot_ui dockerfile: Dockerfile ports: - "18080:8080" @@ -12,3 +12,21 @@ services: - "host.docker.internal:host-gateway" volumes: - ./arch_config.yaml:/app/arch_config.yaml + + jaeger: + build: + context: ../shared/jaeger + ports: + - "16686:16686" + - "4317:4317" + - "4318:4318" + + prometheus: + build: + context: ../shared/prometheus + + grafana: + build: + context: ../shared/grafana + ports: + - "3000:3000" diff --git a/demos/shared/chatbot_ui/.vscode/launch.json b/demos/shared/chatbot_ui/.vscode/launch.json index cc443eee..e7f91d36 100644 --- a/demos/shared/chatbot_ui/.vscode/launch.json +++ b/demos/shared/chatbot_ui/.vscode/launch.json @@ -15,19 +15,21 @@ "LLM": "1", "CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1", "STREAMING": "True", - "ARCH_CONFIG": 
"../demos/weather_forecast/arch_config.yaml" + "ARCH_CONFIG": "../../weather_forecast/arch_config.yaml" } }, { + "python": "${workspaceFolder}/venv/bin/python", "name": "chatbot-ui llm", - "cwd": "${workspaceFolder}/app", "type": "debugpy", "request": "launch", - "program": "run.py", + "program": "run_stream.py", "console": "integratedTerminal", "env": { "LLM": "1", - "CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1" + "CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1", + "STREAMING": "True", + "ARCH_CONFIG": "../../llm_routing/arch_config.yaml" } }, ] diff --git a/demos/shared/chatbot_ui/common.py b/demos/shared/chatbot_ui/common.py index 27838397..cfcc6556 100644 --- a/demos/shared/chatbot_ui/common.py +++ b/demos/shared/chatbot_ui/common.py @@ -1,3 +1,4 @@ +from datetime import datetime import json import logging import os @@ -159,13 +160,44 @@ def get_prompt_targets(): config = yaml.safe_load(file) available_tools = [] - for target in config["prompt_targets"]: - if not target.get("default", False): - available_tools.append( - convert_prompt_target_to_openai_format(target) - ) + if "prompt_targets" in config: + for target in config["prompt_targets"]: + if not target.get("default", False): + available_tools.append( + convert_prompt_target_to_openai_format(target) + ) + + return {tool["name"]: tool["info"] for tool in available_tools} + elif "llm_providers" in config: + return config["llm_providers"] - return {tool["name"]: tool["info"] for tool in available_tools} except Exception as e: log.info(e) return None + + +def get_llm_models(): + try: + with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file: + config = yaml.safe_load(file) + + available_models = [""] + default_llm = None + for llm_providers in config["llm_providers"]: + if llm_providers.get("default", False): + default_llm = llm_providers["name"] + else: + available_models.append(llm_providers["name"]) + + # place default model at the beginning of the list + if default_llm: + 
available_models.insert(0, default_llm) + return available_models + except Exception as e: + log.info(e) + return [] + + +def format_log(message): + time_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3] + return f"{time_now} - {message}" diff --git a/demos/shared/chatbot_ui/run_stream.py b/demos/shared/chatbot_ui/run_stream.py index bd4eab56..407d4c05 100644 --- a/demos/shared/chatbot_ui/run_stream.py +++ b/demos/shared/chatbot_ui/run_stream.py @@ -8,7 +8,7 @@ from typing import List, Optional, Tuple from openai import OpenAI from dotenv import load_dotenv -from common import get_prompt_targets, process_stream_chunk +from common import format_log, get_llm_models, get_prompt_targets, process_stream_chunk load_dotenv() @@ -36,20 +36,28 @@ CSS_STYLE = """ footer {visibility: hidden} """ -client = OpenAI( - api_key="--", - base_url=CHAT_COMPLETION_ENDPOINT, -) - def chat( query: Optional[str], conversation: Optional[List[Tuple[str, str]]], history: List[dict], + debug_output: str, + model_selector: str, ): history.append({"role": "user", "content": query}) + if debug_output is None: + debug_output = "" + try: + headers = {} + if model_selector and model_selector != "": + headers["x-arch-llm-provider-hint"] = model_selector + client = OpenAI( + api_key="--", + base_url=CHAT_COMPLETION_ENDPOINT, + default_headers=headers, + ) response = client.chat.completions.create( # we select model from arch_config file model="--", @@ -65,15 +73,20 @@ def chat( conversation.append((query, "")) + model_is_set = False for chunk in response: tokens = process_stream_chunk(chunk, history) + if tokens and not model_is_set: + model_is_set = True + model = history[-1]["model"] + debug_output = debug_output + "\n" + format_log(f"model: {model}") if tokens: conversation[-1] = ( conversation[-1][0], conversation[-1][1] + tokens, ) - yield "", conversation, history + yield "", conversation, history, debug_output, model_selector def main(): @@ -94,8 +107,17 @@ def main(): 
value=get_prompt_targets(), show_indices=False, elem_classes="json-container", - min_height="95vh", + min_height="50vh", ) + model_selector_textbox = gr.Dropdown( + get_llm_models(), + label="override model", + elem_classes="dropdown", + ) + debug_output = gr.TextArea( + label="debug output", + elem_classes="debug_output", + ) with gr.Column(scale=2): chatbot = gr.Chatbot( @@ -110,7 +132,9 @@ def main(): ) textbox.submit( - chat, [textbox, chatbot, history], [textbox, chatbot, history] + chat, + [textbox, chatbot, history, debug_output, model_selector_textbox], + [textbox, chatbot, history, debug_output, model_selector_textbox], ) demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True) diff --git a/demos/shared/grafana/dashboards/envoy_overview.json b/demos/shared/grafana/dashboards/envoy_overview.json index 4089dade..e710e748 100644 --- a/demos/shared/grafana/dashboards/envoy_overview.json +++ b/demos/shared/grafana/dashboards/envoy_overview.json @@ -190,8 +190,8 @@ "targets": [ { "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(input_sequence_length_bucket[1h])))", + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by(le) (rate(input_sequence_length_bucket[5m])))", "fullMetaSearch": false, "includeNullMetadata": false, "legendFormat": "__auto", @@ -200,7 +200,7 @@ "useBackend": false } ], - "title": "input sequence length (p50)", + "title": "input sequence length (p90)", "type": "timeseries" }, { @@ -305,7 +305,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "histogram_quantile(0.5, sum(rate(output_sequence_length_bucket[1h])) by(le))", + "expr": "histogram_quantile(0.9, sum(rate(output_sequence_length_bucket[5m])) by(le))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -315,7 +315,7 @@ "useBackend": false } ], - "title": "output sequence length (p50)", + "title": "output sequence length (p90)", "type": "timeseries" }, { @@ 
-415,7 +415,7 @@ { "disableTextWrap": false, "editorMode": "code", - "expr": "histogram_quantile(0.5, sum by(le) (rate(time_to_first_token_bucket[1h])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(time_to_first_token_bucket[5m])))", "fullMetaSearch": false, "includeNullMetadata": false, "legendFormat": "__auto", @@ -424,7 +424,7 @@ "useBackend": false } ], - "title": "time to first token (p50)", + "title": "time to first token (p90)", "type": "timeseries" }, { @@ -539,20 +539,17 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(request_latency_bucket[1h])))", - "fullMetaSearch": false, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum(rate(request_latency_bucket[60m])) by (le))", "hide": false, "includeNullMetadata": false, "instant": false, "legendFormat": "__auto", "range": true, - "refId": "A", - "useBackend": false + "refId": "B" } ], - "title": "request latency (p50)", + "title": "request latency (p90)", "type": "timeseries" }, { diff --git a/demos/shared/trace_streamer/Dockerfile b/demos/shared/trace_streamer/Dockerfile deleted file mode 100644 index 189c650a..00000000 --- a/demos/shared/trace_streamer/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM python:3.12-slim as arch - -WORKDIR /app - -RUN pip install requests -COPY stream_traces.py . 
- -RUN mkdir -p /var/log -RUN touch /var/log/envoy.log - -CMD ["python", "stream_traces.py"] diff --git a/demos/weather_forecast/docker-compose.yaml b/demos/weather_forecast/docker-compose.yaml index 7d074ae4..fdaa7fcd 100644 --- a/demos/weather_forecast/docker-compose.yaml +++ b/demos/weather_forecast/docker-compose.yaml @@ -30,14 +30,6 @@ services: - "4317:4317" - "4318:4318" - trace_streamer: - build: - context: ../shared/trace_streamer - environment: - - OTEL_TRACING_HTTP_ENDPOINT=http://jaeger:4318/v1/traces - volumes: - - ~/archgw_logs:/var/log/ - prometheus: build: context: ../shared/prometheus diff --git a/demos/weather_forecast_signoz/docker-compose.yaml b/demos/weather_forecast_signoz/docker-compose.yaml index 56d7c9da..1c23f464 100644 --- a/demos/weather_forecast_signoz/docker-compose.yaml +++ b/demos/weather_forecast_signoz/docker-compose.yaml @@ -25,14 +25,6 @@ services: volumes: - ./arch_config.yaml:/app/arch_config.yaml - trace_streamer: - build: - context: ../shared/trace_streamer - environment: - - OTEL_TRACING_HTTP_ENDPOINT=http://otel-collector:4318/v1/traces - volumes: - - ~/archgw_logs:/var/log/ - prometheus: build: context: ../shared/prometheus diff --git a/e2e_tests/run_e2e_tests.sh b/e2e_tests/run_e2e_tests.sh index 16ef22db..d69d4af5 100644 --- a/e2e_tests/run_e2e_tests.sh +++ b/e2e_tests/run_e2e_tests.sh @@ -29,32 +29,24 @@ cd ../demos/weather_forecast docker compose up weather_forecast_service --build -d cd - -print_disk_usage - log building and install model server log ================================= cd ../model_server poetry install cd - -print_disk_usage - log building and installing archgw cli log ================================== cd ../arch/tools sh build_cli.sh cd - -print_disk_usage - log building docker image for arch gateway log ====================================== cd ../ archgw build cd - -print_disk_usage - log startup arch gateway with function calling demo cd .. 
tail -F ~/archgw_logs/modelserver.log & @@ -64,8 +56,6 @@ archgw up demos/weather_forecast/arch_config.yaml kill $model_server_tail_pid cd - -print_disk_usage - log running e2e tests log ================= poetry install