move custom tracer to llm filter (#267)

This commit is contained in:
Adil Hafeez 2024-11-15 10:44:01 -08:00 committed by GitHub
parent 1d229cba8f
commit d3c17c7abd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 335 additions and 133 deletions

View file

@ -13,16 +13,21 @@ FROM envoyproxy/envoy:v1.32-latest as envoy
#Build config generator, so that we have a single build image for both Rust and Python #Build config generator, so that we have a single build image for both Rust and Python
FROM python:3.12-slim as arch FROM python:3.12-slim as arch
RUN apt-get update && apt-get install -y gettext-base curl && apt-get clean && rm -rf /var/lib/apt/lists/* RUN apt-get update && apt-get install -y gettext-base curl supervisor && apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=builder /arch/target/wasm32-wasip1/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm COPY --from=builder /arch/target/wasm32-wasip1/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm
COPY --from=builder /arch/target/wasm32-wasip1/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm COPY --from=builder /arch/target/wasm32-wasip1/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm
COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy
WORKDIR /config WORKDIR /app
COPY arch/requirements.txt . COPY arch/requirements.txt .
RUN pip install -r requirements.txt RUN pip install -r requirements.txt
COPY arch/tools/cli/config_generator.py . COPY arch/tools/cli/config_generator.py .
COPY arch/envoy.template.yaml . COPY arch/envoy.template.yaml .
COPY arch/arch_config_schema.yaml . COPY arch/arch_config_schema.yaml .
COPY arch/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
COPY arch/stream_traces.py .
ENTRYPOINT ["sh", "-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug 2>&1 | tee /var/log/envoy.log"] RUN pip install requests
RUN touch /var/log/envoy.log
ENTRYPOINT ["supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]

View file

@ -8,11 +8,11 @@ services:
- "12000:12000" - "12000:12000"
- "19901:9901" - "19901:9901"
volumes: volumes:
- ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/config/arch_config.yaml - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/app/arch_config.yaml
- /etc/ssl/cert.pem:/etc/ssl/cert.pem - /etc/ssl/cert.pem:/etc/ssl/cert.pem
- ./envoy.template.yaml:/config/envoy.template.yaml - ./envoy.template.yaml:/app/envoy.template.yaml
- ./arch_config_schema.yaml:/config/arch_config_schema.yaml - ./arch_config_schema.yaml:/app/arch_config_schema.yaml
- ./tools/cli/config_generator.py:/config/config_generator.py - ./tools/cli/config_generator.py:/app/config_generator.py
- ../crates/target/wasm32-wasip1/release/llm_gateway.wasm:/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm - ../crates/target/wasm32-wasip1/release/llm_gateway.wasm:/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm
- ../crates/target/wasm32-wasip1/release/prompt_gateway.wasm:/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm - ../crates/target/wasm32-wasip1/release/prompt_gateway.wasm:/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm
- ~/archgw_logs:/var/log/ - ~/archgw_logs:/var/log/
@ -21,3 +21,4 @@ services:
environment: environment:
- OPENAI_API_KEY=${OPENAI_API_KEY:?error} - OPENAI_API_KEY=${OPENAI_API_KEY:?error}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:?error} - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error}
- OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces

View file

@ -16,3 +16,4 @@ services:
environment: environment:
- OPENAI_API_KEY=${OPENAI_API_KEY:?error} - OPENAI_API_KEY=${OPENAI_API_KEY:?error}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:?error} - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error}
- OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces

View file

@ -8,11 +8,13 @@ services:
- "12000:12000" - "12000:12000"
- "19901:9901" - "19901:9901"
volumes: volumes:
- ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/config/arch_config.yaml - ${ARCH_CONFIG_FILE:-../demos/weather_forecast/arch_config.yaml}:/app/arch_config.yaml
- /etc/ssl/cert.pem:/etc/ssl/cert.pem - /etc/ssl/cert.pem:/etc/ssl/cert.pem
- ~/archgw_logs:/var/log/ - ~/archgw_logs:/var/log/
env_file: env_file:
- env.list - env.list
environment:
- OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces
extra_hosts: extra_hosts:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
healthcheck: healthcheck:

View file

@ -242,11 +242,66 @@ static_resources:
typed_config: typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_listener_llm
- name: arch_listener_http_llm
address: address:
socket_address: socket_address:
address: 0.0.0.0 address: 0.0.0.0
port_value: 12000 port_value: 12000
traffic_direction: INBOUND
filter_chains:
- filters:
- name: envoy.filters.network.http_connection_manager
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
{% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
generate_request_id: true
tracing:
provider:
name: envoy.tracers.opentelemetry
typed_config:
"@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig
grpc_service:
envoy_grpc:
cluster_name: opentelemetry_collector
timeout: 0.250s
service_name: arch_gateway
random_sampling:
value: {{ arch_tracing.random_sampling }}
{% endif %}
stat_prefix: arch_listener_http
codec_type: AUTO
scheme_header_transformation:
scheme_to_overwrite: https
access_log:
- name: envoy.access_loggers.file
typed_config:
"@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
path: "/var/log/access_llm.log"
route_config:
name: local_routes
virtual_hosts:
- name: local_service
domains:
- "*"
routes:
- match:
prefix: "/"
route:
auto_host_rewrite: true
cluster: arch_listener_llm
timeout: 60s
http_filters:
- name: envoy.filters.http.router
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_listener_llm
address:
socket_address:
address: 0.0.0.0
port_value: 12001
filter_chains: filter_chains:
- filters: - filters:
- name: envoy.filters.network.http_connection_manager - name: envoy.filters.network.http_connection_manager
@ -479,6 +534,23 @@ static_resources:
port_value: 10001 port_value: 10001
hostname: arch_prompt_gateway_listener hostname: arch_prompt_gateway_listener
- name: arch_listener_llm
connect_timeout: 5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: arch_listener_llm
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 0.0.0.0
port_value: 12001
hostname: arch_listener_llm
{% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %} {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
- name: opentelemetry_collector - name: opentelemetry_collector
type: STRICT_DNS type: STRICT_DNS

View file

@ -1,4 +1,5 @@
import os import os
import sys
import time import time
import requests import requests
import logging import logging
@ -29,14 +30,8 @@ def process_log_line(line):
logging.error(f"Failed to send trace to otel-tracing: {e}") logging.error(f"Failed to send trace to otel-tracing: {e}")
with open(envoy_log_path, "r") as f: for line in sys.stdin:
# Seek to the end of the file so we only read new lines if line:
f.seek(0, os.SEEK_END) tokens = line.split("gateway: upstream_llm trace details: ")
while True:
line = f.readline()
if not line:
time.sleep(1)
continue
tokens = line.split("prompt_gateway: upstream_llm trace details: ")
if len(tokens) > 1: if len(tokens) > 1:
process_log_line(tokens[1]) process_log_line(tokens[1])

25
arch/supervisord.conf Normal file
View file

@ -0,0 +1,25 @@
[supervisord]
nodaemon=true
[program:trace_streamer]
command=sh -c "tail -F /var/log/envoy.log | python stream_traces.py"
autostart=true
autorestart=false
startretries=3
priority=1
stdout_logfile=/dev/stdout
stderr_logfile=/dev/stderr
stdout_logfile_maxbytes = 0
stderr_logfile_maxbytes = 0
[program:envoy]
command=sh -c "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug 2>&1 | tee /var/log/envoy.log"
autostart=true
autorestart=true
startretries=3
priority=2
stdout_logfile=/dev/stdout
stderr_logfile=/dev/stderr
stdout_logfile_maxbytes = 0
stderr_logfile_maxbytes = 0

View file

@ -6,7 +6,7 @@ from jsonschema import validate
ENVOY_CONFIG_TEMPLATE_FILE = os.getenv( ENVOY_CONFIG_TEMPLATE_FILE = os.getenv(
"ENVOY_CONFIG_TEMPLATE_FILE", "envoy.template.yaml" "ENVOY_CONFIG_TEMPLATE_FILE", "envoy.template.yaml"
) )
ARCH_CONFIG_FILE = os.getenv("ARCH_CONFIG_FILE", "/config/arch_config.yaml") ARCH_CONFIG_FILE = os.getenv("ARCH_CONFIG_FILE", "/app/arch_config.yaml")
ENVOY_CONFIG_FILE_RENDERED = os.getenv( ENVOY_CONFIG_FILE_RENDERED = os.getenv(
"ENVOY_CONFIG_FILE_RENDERED", "/etc/envoy/envoy.yaml" "ENVOY_CONFIG_FILE_RENDERED", "/etc/envoy/envoy.yaml"
) )

View file

@ -21,22 +21,25 @@
"path": "e2e_tests" "path": "e2e_tests"
}, },
{ {
"name": "demos/weather_forecast", "name": "chatbot_ui",
"path": "./demos/weather_forecast", "path": "demos/shared/chatbot_ui"
}, }
{
"name": "demos/insurance_agent",
"path": "./demos/insurance_agent",
},
], ],
"settings": { "settings": {
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true
},
}, },
"extensions": { "extensions": {
"recommendations": [ "recommendations": [
"ms-python.python", "ms-python.python",
"ms-python.debugpy", "ms-python.debugpy",
"rust-lang.rust-analyzer", "rust-lang.rust-analyzer",
"humao.rest-client" "humao.rest-client",
"github.copilot",
"eamodio.gitlens",
"ms-python.black-formatter",
] ]
} }
} }

View file

@ -1,17 +1,18 @@
use crate::filter_context::WasmMetrics; use crate::filter_context::WasmMetrics;
use common::common_types::open_ai::{ use common::common_types::open_ai::{
ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse, ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse,
StreamOptions, Message, StreamOptions,
}; };
use common::configuration::LlmProvider; use common::configuration::LlmProvider;
use common::consts::{ use common::consts::{
ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH, ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH,
RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, TRACE_PARENT_HEADER,
}; };
use common::errors::ServerError; use common::errors::ServerError;
use common::llm_providers::LlmProviders; use common::llm_providers::LlmProviders;
use common::pii::obfuscate_auth_header; use common::pii::obfuscate_auth_header;
use common::ratelimit::Header; use common::ratelimit::Header;
use common::tracing::{Event, Span};
use common::{ratelimit, routing, tokenizer}; use common::{ratelimit, routing, tokenizer};
use http::StatusCode; use http::StatusCode;
use log::{debug, trace, warn}; use log::{debug, trace, warn};
@ -23,7 +24,7 @@ use std::rc::Rc;
use common::stats::{IncrementingMetric, RecordingMetric}; use common::stats::{IncrementingMetric, RecordingMetric};
use proxy_wasm::hostcalls::get_current_time; use proxy_wasm::hostcalls::get_current_time;
use std::time::{Duration, SystemTime}; use std::time::{Duration, SystemTime, UNIX_EPOCH};
pub struct StreamContext { pub struct StreamContext {
context_id: u32, context_id: u32,
@ -36,7 +37,10 @@ pub struct StreamContext {
llm_provider: Option<Rc<LlmProvider>>, llm_provider: Option<Rc<LlmProvider>>,
request_id: Option<String>, request_id: Option<String>,
start_time: Option<SystemTime>, start_time: Option<SystemTime>,
ttft_duration: Option<Duration>, // Store the duration directly ttft_duration: Option<Duration>,
ttft_time: Option<SystemTime>,
pub traceparent: Option<String>,
user_message: Option<Message>,
} }
impl StreamContext { impl StreamContext {
@ -53,6 +57,9 @@ impl StreamContext {
request_id: None, request_id: None,
start_time: None, start_time: None,
ttft_duration: None, ttft_duration: None,
traceparent: None,
ttft_time: None,
user_message: None,
} }
} }
fn llm_provider(&self) -> &LlmProvider { fn llm_provider(&self) -> &LlmProvider {
@ -176,9 +183,10 @@ impl HttpContext for StreamContext {
); );
self.request_id = self.get_http_request_header(REQUEST_ID_HEADER); self.request_id = self.get_http_request_header(REQUEST_ID_HEADER);
self.traceparent = self.get_http_request_header(TRACE_PARENT_HEADER);
//start the timing for the request using get_current_time() //start the timing for the request using get_current_time()
let current_time = get_current_time().unwrap(); let current_time: SystemTime = get_current_time().unwrap();
self.start_time = Some(current_time); self.start_time = Some(current_time);
self.ttft_duration = None; self.ttft_duration = None;
@ -229,6 +237,13 @@ impl HttpContext for StreamContext {
message.model = None; message.model = None;
} }
self.user_message = deserialized_body
.messages
.iter()
.filter(|m| m.role == "user")
.last()
.cloned();
// override model name from the llm provider // override model name from the llm provider
deserialized_body deserialized_body
.model .model
@ -318,6 +333,52 @@ impl HttpContext for StreamContext {
.output_sequence_length .output_sequence_length
.record(self.response_tokens as u64); .record(self.response_tokens as u64);
if let Some(traceparent) = self.traceparent.as_ref() {
let since_the_epoch_ns = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_nanos();
let traceparent_tokens = traceparent.split("-").collect::<Vec<&str>>();
if traceparent_tokens.len() != 4 {
warn!("traceparent header is invalid: {}", traceparent);
return Action::Continue;
}
let parent_trace_id = traceparent_tokens[1];
let parent_span_id = traceparent_tokens[2];
let mut trace_data = common::tracing::TraceData::new();
let mut llm_span = Span::new(
"upstream_llm_time".to_string(),
parent_trace_id.to_string(),
Some(parent_span_id.to_string()),
self.start_time
.unwrap()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_nanos(),
since_the_epoch_ns,
);
if let Some(user_message) = self.user_message.as_ref() {
if let Some(prompt) = user_message.content.as_ref() {
llm_span.add_attribute("user_prompt".to_string(), prompt.to_string());
}
}
llm_span.add_attribute("model".to_string(), self.llm_provider().name.to_string());
llm_span.add_event(Event::new(
"time_to_first_token".to_string(),
self.ttft_time
.unwrap()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_nanos(),
));
trace_data.add_span(llm_span);
let trace_data_str = serde_json::to_string(&trace_data).unwrap();
debug!("upstream_llm trace details: {}", trace_data_str);
// send trace_data to http tracing endpoint
}
return Action::Continue; return Action::Continue;
} }
@ -413,6 +474,7 @@ impl HttpContext for StreamContext {
if self.ttft_duration.is_none() { if self.ttft_duration.is_none() {
if let Some(start_time) = self.start_time { if let Some(start_time) = self.start_time {
let current_time = get_current_time().unwrap(); let current_time = get_current_time().unwrap();
self.ttft_time = Some(current_time);
match current_time.duration_since(start_time) { match current_time.duration_since(start_time) {
Ok(duration) => { Ok(duration) => {
let duration_ms = duration.as_millis(); let duration_ms = duration.as_millis();

View file

@ -51,6 +51,8 @@ fn request_headers_expectations(module: &mut Tester, http_context: i32) {
.expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None)
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id")) .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id"))
.returning(None) .returning(None)
.expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("traceparent"))
.returning(None)
.expect_get_current_time_nanos() .expect_get_current_time_nanos()
.returning(Some(0)) .returning(Some(0))
.execute_and_expect(ReturnType::Action(Action::Continue)) .execute_and_expect(ReturnType::Action(Action::Continue))

View file

@ -18,7 +18,6 @@ use common::{
errors::ServerError, errors::ServerError,
http::{CallArgs, Client}, http::{CallArgs, Client},
pii::obfuscate_auth_header, pii::obfuscate_auth_header,
tracing::{Event, Span},
}; };
use http::StatusCode; use http::StatusCode;
use log::{debug, trace, warn}; use log::{debug, trace, warn};
@ -265,42 +264,6 @@ impl HttpContext for StreamContext {
} }
if end_of_stream && body_size == 0 { if end_of_stream && body_size == 0 {
if let Some(traceparent) = self.traceparent.as_ref() {
let since_the_epoch_ns = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_nanos();
let traceparent_tokens = traceparent.split("-").collect::<Vec<&str>>();
if traceparent_tokens.len() != 4 {
warn!("traceparent header is invalid: {}", traceparent);
return Action::Continue;
}
let parent_trace_id = traceparent_tokens[1];
let parent_span_id = traceparent_tokens[2];
let mut trace_data = common::tracing::TraceData::new();
let mut llm_span = Span::new(
"upstream_llm_time".to_string(),
parent_trace_id.to_string(),
Some(parent_span_id.to_string()),
self.start_upstream_llm_request_time,
since_the_epoch_ns,
);
if let Some(prompt) = self.user_prompt.as_ref() {
if let Some(content) = prompt.content.as_ref() {
llm_span.add_attribute("user_prompt".to_string(), content.to_string());
}
}
llm_span.add_event(Event::new(
"time_to_first_token".to_string(),
self.time_to_first_token.unwrap(),
));
trace_data.add_span(llm_span);
let trace_data_str = serde_json::to_string(&trace_data).unwrap();
debug!("upstream_llm trace details: {}", trace_data_str);
// send trace_data to http tracing endpoint
}
return Action::Continue; return Action::Continue;
} }

View file

@ -0,0 +1,19 @@
# LLM Routing
This demo shows how you can use arch gateway to manage keys and route to the appropriate LLM.
# Starting the demo
1. Please make sure the [pre-requisites](https://github.com/katanemo/arch/?tab=readme-ov-file#prerequisites) are installed correctly
1. Start Arch
```sh
sh run_demo.sh
```
1. Navigate to http://localhost:18080/
# Observability
Arch gateway publishes a stats endpoint at http://localhost:19901/stats. In this demo we use prometheus to pull stats from arch, and grafana to visualize the stats in a dashboard. To see the grafana dashboard, follow the instructions below:
1. Navigate to http://localhost:3000/ to open grafana UI (use admin/grafana as credentials)
1. From grafana left nav click on dashboards and select "Intelligent Gateway Overview" to view arch gateway stats
# Selecting different LLM
You can pick a different LLM by setting the header `x-arch-llm-provider-hint` to override the default LLM.

View file

@ -2,7 +2,7 @@ services:
chatbot_ui: chatbot_ui:
build: build:
context: ../../chatbot_ui context: ../shared/chatbot_ui
dockerfile: Dockerfile dockerfile: Dockerfile
ports: ports:
- "18080:8080" - "18080:8080"
@ -12,3 +12,21 @@ services:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
volumes: volumes:
- ./arch_config.yaml:/app/arch_config.yaml - ./arch_config.yaml:/app/arch_config.yaml
jaeger:
build:
context: ../shared/jaeger
ports:
- "16686:16686"
- "4317:4317"
- "4318:4318"
prometheus:
build:
context: ../shared/prometheus
grafana:
build:
context: ../shared/grafana
ports:
- "3000:3000"

View file

@ -15,19 +15,21 @@
"LLM": "1", "LLM": "1",
"CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1", "CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1",
"STREAMING": "True", "STREAMING": "True",
"ARCH_CONFIG": "../demos/weather_forecast/arch_config.yaml" "ARCH_CONFIG": "../../weather_forecast/arch_config.yaml"
} }
}, },
{ {
"python": "${workspaceFolder}/venv/bin/python",
"name": "chatbot-ui llm", "name": "chatbot-ui llm",
"cwd": "${workspaceFolder}/app",
"type": "debugpy", "type": "debugpy",
"request": "launch", "request": "launch",
"program": "run.py", "program": "run_stream.py",
"console": "integratedTerminal", "console": "integratedTerminal",
"env": { "env": {
"LLM": "1", "LLM": "1",
"CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1" "CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1",
"STREAMING": "True",
"ARCH_CONFIG": "../../llm_routing/arch_config.yaml"
} }
}, },
] ]

View file

@ -1,3 +1,4 @@
from datetime import datetime
import json import json
import logging import logging
import os import os
@ -159,13 +160,44 @@ def get_prompt_targets():
config = yaml.safe_load(file) config = yaml.safe_load(file)
available_tools = [] available_tools = []
for target in config["prompt_targets"]: if "prompt_targets" in config:
if not target.get("default", False): for target in config["prompt_targets"]:
available_tools.append( if not target.get("default", False):
convert_prompt_target_to_openai_format(target) available_tools.append(
) convert_prompt_target_to_openai_format(target)
)
return {tool["name"]: tool["info"] for tool in available_tools}
elif "llm_providers" in config:
return config["llm_providers"]
return {tool["name"]: tool["info"] for tool in available_tools}
except Exception as e: except Exception as e:
log.info(e) log.info(e)
return None return None
def get_llm_models():
try:
with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file:
config = yaml.safe_load(file)
available_models = [""]
default_llm = None
for llm_providers in config["llm_providers"]:
if llm_providers.get("default", False):
default_llm = llm_providers["name"]
else:
available_models.append(llm_providers["name"])
# place default model at the beginning of the list
if default_llm:
available_models.insert(0, default_llm)
return available_models
except Exception as e:
log.info(e)
return []
def format_log(message):
time_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
return f"{time_now} - {message}"

View file

@ -8,7 +8,7 @@ from typing import List, Optional, Tuple
from openai import OpenAI from openai import OpenAI
from dotenv import load_dotenv from dotenv import load_dotenv
from common import get_prompt_targets, process_stream_chunk from common import format_log, get_llm_models, get_prompt_targets, process_stream_chunk
load_dotenv() load_dotenv()
@ -36,20 +36,28 @@ CSS_STYLE = """
footer {visibility: hidden} footer {visibility: hidden}
""" """
client = OpenAI(
api_key="--",
base_url=CHAT_COMPLETION_ENDPOINT,
)
def chat( def chat(
query: Optional[str], query: Optional[str],
conversation: Optional[List[Tuple[str, str]]], conversation: Optional[List[Tuple[str, str]]],
history: List[dict], history: List[dict],
debug_output: str,
model_selector: str,
): ):
history.append({"role": "user", "content": query}) history.append({"role": "user", "content": query})
if debug_output is None:
debug_output = ""
try: try:
headers = {}
if model_selector and model_selector != "":
headers["x-arch-llm-provider-hint"] = model_selector
client = OpenAI(
api_key="--",
base_url=CHAT_COMPLETION_ENDPOINT,
default_headers=headers,
)
response = client.chat.completions.create( response = client.chat.completions.create(
# we select model from arch_config file # we select model from arch_config file
model="--", model="--",
@ -65,15 +73,20 @@ def chat(
conversation.append((query, "")) conversation.append((query, ""))
model_is_set = False
for chunk in response: for chunk in response:
tokens = process_stream_chunk(chunk, history) tokens = process_stream_chunk(chunk, history)
if tokens and not model_is_set:
model_is_set = True
model = history[-1]["model"]
debug_output = debug_output + "\n" + format_log(f"model: {model}")
if tokens: if tokens:
conversation[-1] = ( conversation[-1] = (
conversation[-1][0], conversation[-1][0],
conversation[-1][1] + tokens, conversation[-1][1] + tokens,
) )
yield "", conversation, history yield "", conversation, history, debug_output, model_selector
def main(): def main():
@ -94,8 +107,17 @@ def main():
value=get_prompt_targets(), value=get_prompt_targets(),
show_indices=False, show_indices=False,
elem_classes="json-container", elem_classes="json-container",
min_height="95vh", min_height="50vh",
) )
model_selector_textbox = gr.Dropdown(
get_llm_models(),
label="override model",
elem_classes="dropdown",
)
debug_output = gr.TextArea(
label="debug output",
elem_classes="debug_output",
)
with gr.Column(scale=2): with gr.Column(scale=2):
chatbot = gr.Chatbot( chatbot = gr.Chatbot(
@ -110,7 +132,9 @@ def main():
) )
textbox.submit( textbox.submit(
chat, [textbox, chatbot, history], [textbox, chatbot, history] chat,
[textbox, chatbot, history, debug_output, model_selector_textbox],
[textbox, chatbot, history, debug_output, model_selector_textbox],
) )
demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True) demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True)

View file

@ -190,8 +190,8 @@
"targets": [ "targets": [
{ {
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "code",
"expr": "histogram_quantile(0.5, sum by(le) (rate(input_sequence_length_bucket[1h])))", "expr": "histogram_quantile(0.9, sum by(le) (rate(input_sequence_length_bucket[5m])))",
"fullMetaSearch": false, "fullMetaSearch": false,
"includeNullMetadata": false, "includeNullMetadata": false,
"legendFormat": "__auto", "legendFormat": "__auto",
@ -200,7 +200,7 @@
"useBackend": false "useBackend": false
} }
], ],
"title": "input sequence length (p50)", "title": "input sequence length (p90)",
"type": "timeseries" "type": "timeseries"
}, },
{ {
@ -305,7 +305,7 @@
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
"expr": "histogram_quantile(0.5, sum(rate(output_sequence_length_bucket[1h])) by(le))", "expr": "histogram_quantile(0.9, sum(rate(output_sequence_length_bucket[5m])) by(le))",
"fullMetaSearch": false, "fullMetaSearch": false,
"includeNullMetadata": false, "includeNullMetadata": false,
"instant": false, "instant": false,
@ -315,7 +315,7 @@
"useBackend": false "useBackend": false
} }
], ],
"title": "output sequence length (p50)", "title": "output sequence length (p90)",
"type": "timeseries" "type": "timeseries"
}, },
{ {
@ -415,7 +415,11 @@
{ {
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
<<<<<<< HEAD
"expr": "histogram_quantile(0.9, sum by(le) (rate(time_to_first_token_bucket[5m])))",
=======
"expr": "histogram_quantile(0.5, sum by(le) (rate(time_to_first_token_bucket[1h])))", "expr": "histogram_quantile(0.5, sum by(le) (rate(time_to_first_token_bucket[1h])))",
>>>>>>> main
"fullMetaSearch": false, "fullMetaSearch": false,
"includeNullMetadata": false, "includeNullMetadata": false,
"legendFormat": "__auto", "legendFormat": "__auto",
@ -424,7 +428,7 @@
"useBackend": false "useBackend": false
} }
], ],
"title": "time to first token (p50)", "title": "time to first token (p90)",
"type": "timeseries" "type": "timeseries"
}, },
{ {
@ -539,20 +543,29 @@
"type": "prometheus", "type": "prometheus",
"uid": "PBFA97CFB590B2093" "uid": "PBFA97CFB590B2093"
}, },
<<<<<<< HEAD
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum(rate(request_latency_bucket[60m])) by (le))",
=======
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(request_latency_bucket[1h])))", "expr": "histogram_quantile(0.5, sum by(le) (rate(request_latency_bucket[1h])))",
"fullMetaSearch": false, "fullMetaSearch": false,
>>>>>>> main
"hide": false, "hide": false,
"includeNullMetadata": false, "includeNullMetadata": false,
"instant": false, "instant": false,
"legendFormat": "__auto", "legendFormat": "__auto",
"range": true, "range": true,
<<<<<<< HEAD
"refId": "B"
=======
"refId": "A", "refId": "A",
"useBackend": false "useBackend": false
>>>>>>> main
} }
], ],
"title": "request latency (p50)", "title": "request latency (p90)",
"type": "timeseries" "type": "timeseries"
}, },
{ {

View file

@ -1,11 +0,0 @@
FROM python:3.12-slim as arch
WORKDIR /app
RUN pip install requests
COPY stream_traces.py .
RUN mkdir -p /var/log
RUN touch /var/log/envoy.log
CMD ["python", "stream_traces.py"]

View file

@ -30,14 +30,6 @@ services:
- "4317:4317" - "4317:4317"
- "4318:4318" - "4318:4318"
trace_streamer:
build:
context: ../shared/trace_streamer
environment:
- OTEL_TRACING_HTTP_ENDPOINT=http://jaeger:4318/v1/traces
volumes:
- ~/archgw_logs:/var/log/
prometheus: prometheus:
build: build:
context: ../shared/prometheus context: ../shared/prometheus

View file

@ -25,14 +25,6 @@ services:
volumes: volumes:
- ./arch_config.yaml:/app/arch_config.yaml - ./arch_config.yaml:/app/arch_config.yaml
trace_streamer:
build:
context: ../shared/trace_streamer
environment:
- OTEL_TRACING_HTTP_ENDPOINT=http://otel-collector:4318/v1/traces
volumes:
- ~/archgw_logs:/var/log/
prometheus: prometheus:
build: build:
context: ../shared/prometheus context: ../shared/prometheus

View file

@ -29,32 +29,24 @@ cd ../demos/weather_forecast
docker compose up weather_forecast_service --build -d docker compose up weather_forecast_service --build -d
cd - cd -
print_disk_usage
log building and install model server log building and install model server
log ================================= log =================================
cd ../model_server cd ../model_server
poetry install poetry install
cd - cd -
print_disk_usage
log building and installing archgw cli log building and installing archgw cli
log ================================== log ==================================
cd ../arch/tools cd ../arch/tools
sh build_cli.sh sh build_cli.sh
cd - cd -
print_disk_usage
log building docker image for arch gateway log building docker image for arch gateway
log ====================================== log ======================================
cd ../ cd ../
archgw build archgw build
cd - cd -
print_disk_usage
log startup arch gateway with function calling demo log startup arch gateway with function calling demo
cd .. cd ..
tail -F ~/archgw_logs/modelserver.log & tail -F ~/archgw_logs/modelserver.log &
@ -64,8 +56,6 @@ archgw up demos/weather_forecast/arch_config.yaml
kill $model_server_tail_pid kill $model_server_tail_pid
cd - cd -
print_disk_usage
log running e2e tests log running e2e tests
log ================= log =================
poetry install poetry install