mirror of
https://github.com/katanemo/plano.git
synced 2026-05-08 23:32:43 +02:00
Improve end to end tracing (#628)
* adding canonical tracing support via bright-staff * improved formatting for tools in the traces * removing anthropic from the currency exchange demo * using Envoy to transport traces, not calling OTEL directly * moving otel collcetor cluster outside tracing if/else * minor fixes to not write to the OTEL collector if tracing is disabled * fixed PR comments and added more trace attributes * more fixes based on PR comments * more clean up based on PR comments --------- Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-342.local>
This commit is contained in:
parent
8adb9795d8
commit
a79f55f313
34 changed files with 2556 additions and 403 deletions
345
crates/brightstaff/src/handlers/llm.rs
Normal file
345
crates/brightstaff/src/handlers/llm.rs
Normal file
|
|
@ -0,0 +1,345 @@
|
|||
use bytes::Bytes;
|
||||
use common::configuration::{LlmProvider, ModelAlias};
|
||||
use common::consts::{ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER};
|
||||
use common::traces::TraceCollector;
|
||||
use hermesllm::clients::SupportedAPIsFromClient;
|
||||
use hermesllm::{ProviderRequest, ProviderRequestType};
|
||||
use http_body_util::combinators::BoxBody;
|
||||
use http_body_util::{BodyExt, Full};
|
||||
use hyper::header::{self};
|
||||
use hyper::{Request, Response, StatusCode};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use crate::router::llm_router::RouterService;
|
||||
use crate::handlers::utils::{create_streaming_response, ObservableStreamProcessor, truncate_message};
|
||||
use crate::handlers::router_chat::router_chat_get_upstream_model;
|
||||
use crate::tracing::operation_component;
|
||||
|
||||
fn full<T: Into<Bytes>>(chunk: T) -> BoxBody<Bytes, hyper::Error> {
|
||||
Full::new(chunk.into())
|
||||
.map_err(|never| match never {})
|
||||
.boxed()
|
||||
}
|
||||
|
||||
/// HTTP handler for LLM chat requests.
///
/// Pipeline: parse the body into a provider request, resolve any model alias,
/// ask the router (`router_chat_get_upstream_model`) for the upstream model,
/// forward the request to `full_qualified_llm_provider_url`, and stream the
/// provider's response back while recording an LLM trace span via
/// `ObservableStreamProcessor`.
///
/// Client-visible failures (parse errors, routing errors, upstream send
/// failures) are returned as HTTP 4xx/5xx responses in `Ok(...)`; the `Err`
/// arm only surfaces hyper body-read failures from `request.collect()`.
pub async fn llm_chat(
    request: Request<hyper::body::Incoming>,
    router_service: Arc<RouterService>,
    full_qualified_llm_provider_url: String,
    model_aliases: Arc<Option<HashMap<String, ModelAlias>>>,
    llm_providers: Arc<RwLock<Vec<LlmProvider>>>,
    trace_collector: Arc<TraceCollector>,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {

    let request_path = request.uri().path().to_string();
    let request_headers = request.headers().clone();

    // Extract or generate traceparent - this establishes the trace context for all spans.
    // NOTE(review): the synthesized value uses an all-zero parent span id
    // ("0000000000000000"), which the W3C Trace Context spec forbids as a
    // parent-id — confirm downstream consumers accept it as "root".
    let traceparent: String = request_headers
        .get("traceparent")
        .and_then(|h| h.to_str().ok())
        .map(|s| s.to_string())
        .unwrap_or_else(|| {
            use uuid::Uuid;
            let trace_id = Uuid::new_v4().to_string().replace("-", "");
            format!("00-{}-0000000000000000-01", trace_id)
        });

    // Re-bind mutably: headers are edited below before being forwarded upstream.
    let mut request_headers = request_headers;
    let chat_request_bytes = request.collect().await?.to_bytes();

    debug!(
        "Received request body (raw utf8): {}",
        String::from_utf8_lossy(&chat_request_bytes)
    );

    // Parse the raw bytes into the provider-specific request type for this endpoint.
    // NOTE(review): `from_endpoint(...).unwrap()` panics on an unsupported
    // request path — presumably routing guarantees only known endpoints reach
    // this handler; confirm, or return 400 instead.
    let mut client_request = match ProviderRequestType::try_from((
        &chat_request_bytes[..],
        &SupportedAPIsFromClient::from_endpoint(request_path.as_str()).unwrap(),
    )) {
        Ok(request) => request,
        Err(err) => {
            warn!("Failed to parse request as ProviderRequestType: {}", err);
            let err_msg = format!("Failed to parse request: {}", err);
            let mut bad_request = Response::new(full(err_msg));
            *bad_request.status_mut() = StatusCode::BAD_REQUEST;
            return Ok(bad_request);
        }
    };

    // Model alias resolution: update model field in client_request immediately
    // This ensures all downstream objects use the resolved model
    let model_from_request = client_request.model().to_string();
    let temperature = client_request.get_temperature();
    let is_streaming_request = client_request.is_streaming();
    let resolved_model = resolve_model_alias(&model_from_request, &model_aliases);

    // Extract tool names and user message preview for span attributes
    let tool_names = client_request.get_tool_names();
    let user_message_preview = client_request.get_recent_user_message()
        .map(|msg| truncate_message(&msg, 50));

    client_request.set_model(resolved_model.clone());
    // Strip internal routing metadata so it is never forwarded to the provider.
    if client_request.remove_metadata_key("archgw_preference_config") {
        debug!("Removed archgw_preference_config from metadata");
    }

    // Serialize now (before routing consumes client_request) — these bytes go upstream.
    // NOTE(review): `.unwrap()` assumes serialization of an already-parsed
    // request cannot fail — confirm to_bytes is infallible for valid requests.
    let client_request_bytes_for_upstream = ProviderRequestType::to_bytes(&client_request).unwrap();

    // Determine routing using the dedicated router_chat module
    let routing_result = match router_chat_get_upstream_model(
        router_service,
        client_request, // Pass the original request - router_chat will convert it
        &request_headers,
        trace_collector.clone(),
        &traceparent,
        &request_path,
    )
    .await
    {
        Ok(result) => result,
        Err(err) => {
            // Routing errors carry their own status code (see RoutingError).
            let mut internal_error = Response::new(full(err.message));
            *internal_error.status_mut() = err.status_code;
            return Ok(internal_error);
        }
    };

    let model_name = routing_result.model_name;

    debug!(
        "[ARCH_ROUTER] URL: {}, Resolved Model: {}",
        full_qualified_llm_provider_url, model_name
    );

    // Hint headers consumed by the proxy layer to pick the provider.
    // NOTE(review): HeaderValue::from_str panics (via unwrap) if the model
    // name contains non-visible-ASCII characters — model names originate from
    // config/routing, but confirm they are validated upstream.
    request_headers.insert(
        ARCH_PROVIDER_HINT_HEADER,
        header::HeaderValue::from_str(&model_name).unwrap(),
    );

    request_headers.insert(
        header::HeaderName::from_static(ARCH_IS_STREAMING_HEADER),
        header::HeaderValue::from_str(&is_streaming_request.to_string()).unwrap(),
    );
    // remove content-length header if it exists
    // (the body may have changed length after alias/metadata rewriting)
    request_headers.remove(header::CONTENT_LENGTH);

    // Capture start time right before sending request to upstream
    // Instant for duration math, SystemTime for the span's wall-clock start.
    let request_start_time = std::time::Instant::now();
    let request_start_system_time = std::time::SystemTime::now();

    // NOTE(review): reqwest::Client::new() builds a fresh client (and
    // connection pool) per request — consider a shared client if this is hot.
    let llm_response = match reqwest::Client::new()
        .post(full_qualified_llm_provider_url)
        .headers(request_headers)
        .body(client_request_bytes_for_upstream)
        .send()
        .await
    {
        Ok(res) => res,
        Err(err) => {
            let err_msg = format!("Failed to send request: {}", err);
            let mut internal_error = Response::new(full(err_msg));
            *internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
            return Ok(internal_error);
        }
    };

    // copy over the headers and status code from the original response
    let response_headers = llm_response.headers().clone();
    let upstream_status = llm_response.status();
    let mut response = Response::builder().status(upstream_status);
    let headers = response.headers_mut().unwrap();
    for (header_name, header_value) in response_headers.iter() {
        headers.insert(header_name, header_value.clone());
    }

    // Build LLM span with actual status code using constants
    let byte_stream = llm_response.bytes_stream();

    // Build the LLM span (will be finalized after streaming completes)
    let llm_span = build_llm_span(
        &traceparent,
        &request_path,
        &resolved_model,
        &model_name,
        upstream_status.as_u16(),
        is_streaming_request,
        request_start_system_time,
        tool_names,
        user_message_preview,
        temperature,
        &llm_providers,
    ).await;

    // Use PassthroughProcessor to track streaming metrics and finalize the span
    let processor = ObservableStreamProcessor::new(
        trace_collector,
        operation_component::LLM,
        llm_span,
        request_start_time,
    );

    // 16 is the channel buffer size for the streamed frames.
    let streaming_response = create_streaming_response(byte_stream, processor, 16);

    match response.body(streaming_response.body) {
        Ok(response) => Ok(response),
        Err(err) => {
            let err_msg = format!("Failed to create response: {}", err);
            let mut internal_error = Response::new(full(err_msg));
            *internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
            Ok(internal_error)
        }
    }
}
|
||||
|
||||
/// Resolves model aliases by looking up the requested model in the model_aliases map.
|
||||
/// Returns the target model if an alias is found, otherwise returns the original model.
|
||||
fn resolve_model_alias(
|
||||
model_from_request: &str,
|
||||
model_aliases: &Arc<Option<HashMap<String, ModelAlias>>>,
|
||||
) -> String {
|
||||
if let Some(aliases) = model_aliases.as_ref() {
|
||||
if let Some(model_alias) = aliases.get(model_from_request) {
|
||||
debug!(
|
||||
"Model Alias: 'From {}' -> 'To {}'",
|
||||
model_from_request, model_alias.target
|
||||
);
|
||||
return model_alias.target.clone();
|
||||
}
|
||||
}
|
||||
model_from_request.to_string()
|
||||
}
|
||||
|
||||
/// Builds the LLM span with all required and optional attributes.
///
/// The span is constructed with the upstream call's wall-clock start time and
/// is expected to be finalized (end time, streaming metrics) by the caller's
/// stream processor after the response completes. Optional attributes
/// (temperature, tools, user-message preview) are added only when present.
async fn build_llm_span(
    traceparent: &str,
    request_path: &str,
    resolved_model: &str,
    model_name: &str,
    status_code: u16,
    is_streaming: bool,
    start_time: std::time::SystemTime,
    tool_names: Option<Vec<String>>,
    user_message_preview: Option<String>,
    temperature: Option<f32>,
    llm_providers: &Arc<RwLock<Vec<LlmProvider>>>,
) -> common::traces::Span {
    use common::traces::{SpanBuilder, SpanKind, parse_traceparent};
    use crate::tracing::{http, llm, OperationNameBuilder};

    // Calculate the upstream path based on provider configuration
    let upstream_path = get_upstream_path(
        llm_providers,
        model_name,
        request_path,
        resolved_model,
        is_streaming,
    ).await;

    // Build operation name showing path transformation if different
    // (e.g. "/v1/chat/completions >> /v1/messages" when the provider API differs).
    let operation_name = if request_path != upstream_path {
        OperationNameBuilder::new()
            .with_method("POST")
            .with_path(&format!("{} >> {}", request_path, upstream_path))
            .with_target(resolved_model)
            .build()
    } else {
        OperationNameBuilder::new()
            .with_method("POST")
            .with_path(request_path)
            .with_target(resolved_model)
            .build()
    };

    let (trace_id, parent_span_id) = parse_traceparent(traceparent);

    let mut span_builder = SpanBuilder::new(&operation_name)
        .with_trace_id(&trace_id)
        .with_kind(SpanKind::Client)
        .with_start_time(start_time)
        .with_attribute(http::METHOD, "POST")
        .with_attribute(http::STATUS_CODE, status_code.to_string())
        .with_attribute(http::TARGET, request_path.to_string())
        .with_attribute(http::UPSTREAM_TARGET, upstream_path)
        .with_attribute(llm::MODEL_NAME, resolved_model.to_string())
        .with_attribute(llm::IS_STREAMING, is_streaming.to_string());

    // Only set parent span ID if it exists (not a root span)
    if let Some(parent) = parent_span_id {
        span_builder = span_builder.with_parent_span_id(&parent);
    }

    // Add optional attributes
    if let Some(temp) = temperature {
        span_builder = span_builder.with_attribute(llm::TEMPERATURE, temp.to_string());
    }

    if let Some(tools) = tool_names {
        // Render tool names as one-per-line "name(...)" for readable trace UIs.
        let formatted_tools = tools.iter()
            .map(|name| format!("{}(...)", name))
            .collect::<Vec<_>>()
            .join("\n");
        span_builder = span_builder.with_attribute(llm::TOOLS, formatted_tools);
    }

    if let Some(preview) = user_message_preview {
        span_builder = span_builder.with_attribute(llm::USER_MESSAGE_PREVIEW, preview);
    }

    span_builder.build()
}
|
||||
|
||||
/// Calculates the upstream path for the provider based on the model name.
/// Looks up provider configuration, gets the ProviderId and base_url_path_prefix,
/// then uses target_endpoint_for_provider to calculate the correct upstream path.
///
/// Provider resolution order: (1) a provider whose `model` or `name` matches
/// `model_name`; (2) the provider flagged `default`; (3) a hardcoded OpenAI
/// fallback with no path prefix (logged as a warning).
async fn get_upstream_path(
    llm_providers: &Arc<RwLock<Vec<LlmProvider>>>,
    model_name: &str,
    request_path: &str,
    resolved_model: &str,
    is_streaming: bool,
) -> String {
    let providers_lock = llm_providers.read().await;

    // First, try to find by model name or provider name
    let provider = providers_lock.iter().find(|p| {
        p.model.as_ref().map(|m| m == model_name).unwrap_or(false)
            || p.name == model_name
    });

    let (provider_id, base_url_path_prefix) = if let Some(provider) = provider {
        let provider_id = provider.provider_interface.to_provider_id();
        let prefix = provider.base_url_path_prefix.clone();
        (provider_id, prefix)
    } else {
        let default_provider = providers_lock.iter().find(|p| {
            p.default.unwrap_or(false)
        });

        if let Some(provider) = default_provider {
            let provider_id = provider.provider_interface.to_provider_id();
            let prefix = provider.base_url_path_prefix.clone();
            (provider_id, prefix)
        } else {
            // Last resort: use OpenAI as hardcoded fallback
            warn!("No default provider found, falling back to OpenAI");
            (hermesllm::ProviderId::OpenAI, None)
        }
    };

    // Release the read lock early — everything needed was cloned out above.
    drop(providers_lock);

    // Calculate the upstream path using the proper API
    // NOTE(review): this panics on an unrecognized endpoint path; callers in
    // this file have already unwrapped the same lookup earlier, so by this
    // point the path is presumably valid — confirm if this helper gains other
    // callers.
    let client_api = SupportedAPIsFromClient::from_endpoint(request_path)
        .expect("Should have valid API endpoint");

    client_api.target_endpoint_for_provider(
        &provider_id,
        request_path,
        resolved_model,
        is_streaming,
        base_url_path_prefix.as_deref(),
    )
}
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
pub mod agent_chat_completions;
|
||||
pub mod agent_selector;
|
||||
pub mod router;
|
||||
pub mod llm;
|
||||
pub mod router_chat;
|
||||
pub mod models;
|
||||
pub mod function_calling;
|
||||
pub mod pipeline_processor;
|
||||
|
|
|
|||
|
|
@ -1,252 +0,0 @@
|
|||
use bytes::Bytes;
|
||||
use common::configuration::{ModelAlias, ModelUsagePreference};
|
||||
use common::consts::{ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER};
|
||||
use hermesllm::apis::openai::ChatCompletionsRequest;
|
||||
use hermesllm::clients::endpoints::SupportedUpstreamAPIs;
|
||||
use hermesllm::clients::SupportedAPIsFromClient;
|
||||
use hermesllm::{ProviderRequest, ProviderRequestType};
|
||||
use http_body_util::combinators::BoxBody;
|
||||
use http_body_util::{BodyExt, Full};
|
||||
use hyper::header::{self};
|
||||
use hyper::{Request, Response, StatusCode};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::router::llm_router::RouterService;
|
||||
use crate::handlers::utils::{create_streaming_response, PassthroughProcessor};
|
||||
|
||||
fn full<T: Into<Bytes>>(chunk: T) -> BoxBody<Bytes, hyper::Error> {
|
||||
Full::new(chunk.into())
|
||||
.map_err(|never| match never {})
|
||||
.boxed()
|
||||
}
|
||||
|
||||
pub async fn router_chat(
|
||||
request: Request<hyper::body::Incoming>,
|
||||
router_service: Arc<RouterService>,
|
||||
full_qualified_llm_provider_url: String,
|
||||
model_aliases: Arc<Option<HashMap<String, ModelAlias>>>,
|
||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
|
||||
let request_path = request.uri().path().to_string();
|
||||
let mut request_headers = request.headers().clone();
|
||||
let chat_request_bytes = request.collect().await?.to_bytes();
|
||||
|
||||
debug!(
|
||||
"Received request body (raw utf8): {}",
|
||||
String::from_utf8_lossy(&chat_request_bytes)
|
||||
);
|
||||
|
||||
let mut client_request = match ProviderRequestType::try_from((
|
||||
&chat_request_bytes[..],
|
||||
&SupportedAPIsFromClient::from_endpoint(request_path.as_str()).unwrap(),
|
||||
)) {
|
||||
Ok(request) => request,
|
||||
Err(err) => {
|
||||
warn!("Failed to parse request as ProviderRequestType: {}", err);
|
||||
let err_msg = format!("Failed to parse request: {}", err);
|
||||
let mut bad_request = Response::new(full(err_msg));
|
||||
*bad_request.status_mut() = StatusCode::BAD_REQUEST;
|
||||
return Ok(bad_request);
|
||||
}
|
||||
};
|
||||
|
||||
// Model alias resolution: update model field in client_request immediately
|
||||
// This ensures all downstream objects use the resolved model
|
||||
let model_from_request = client_request.model().to_string();
|
||||
let is_streaming_request = client_request.is_streaming();
|
||||
let resolved_model = if let Some(model_aliases) = model_aliases.as_ref() {
|
||||
if let Some(model_alias) = model_aliases.get(&model_from_request) {
|
||||
debug!(
|
||||
"Model Alias: 'From {}' -> 'To {}'",
|
||||
model_from_request, model_alias.target
|
||||
);
|
||||
model_alias.target.clone()
|
||||
} else {
|
||||
model_from_request.clone()
|
||||
}
|
||||
} else {
|
||||
model_from_request.clone()
|
||||
};
|
||||
client_request.set_model(resolved_model.clone());
|
||||
|
||||
// Clone metadata for routing and remove archgw_preference_config from original
|
||||
let routing_metadata = client_request.metadata().clone();
|
||||
|
||||
if client_request.remove_metadata_key("archgw_preference_config") {
|
||||
debug!("Removed archgw_preference_config from metadata");
|
||||
}
|
||||
|
||||
let client_request_bytes_for_upstream = ProviderRequestType::to_bytes(&client_request).unwrap();
|
||||
|
||||
// Convert to ChatCompletionsRequest regardless of input type (clone to avoid moving original)
|
||||
let chat_completions_request_for_arch_router: ChatCompletionsRequest =
|
||||
match ProviderRequestType::try_from((
|
||||
client_request,
|
||||
&SupportedUpstreamAPIs::OpenAIChatCompletions(
|
||||
hermesllm::apis::OpenAIApi::ChatCompletions,
|
||||
),
|
||||
)) {
|
||||
Ok(ProviderRequestType::ChatCompletionsRequest(req)) => req,
|
||||
Ok(
|
||||
ProviderRequestType::MessagesRequest(_)
|
||||
| ProviderRequestType::BedrockConverse(_)
|
||||
| ProviderRequestType::BedrockConverseStream(_)
|
||||
| ProviderRequestType::ResponsesAPIRequest(_),
|
||||
) => {
|
||||
// This should not happen after conversion to OpenAI format
|
||||
warn!("Unexpected: got non-ChatCompletions request after converting to OpenAI format");
|
||||
let err_msg = "Request conversion failed".to_string();
|
||||
let mut bad_request = Response::new(full(err_msg));
|
||||
*bad_request.status_mut() = StatusCode::BAD_REQUEST;
|
||||
return Ok(bad_request);
|
||||
}
|
||||
Err(err) => {
|
||||
warn!(
|
||||
"Failed to convert request to ChatCompletionsRequest: {}",
|
||||
err
|
||||
);
|
||||
let err_msg = format!("Failed to convert request: {}", err);
|
||||
let mut bad_request = Response::new(full(err_msg));
|
||||
*bad_request.status_mut() = StatusCode::BAD_REQUEST;
|
||||
return Ok(bad_request);
|
||||
}
|
||||
};
|
||||
|
||||
debug!(
|
||||
"[ARCH_ROUTER REQ]: {}",
|
||||
&serde_json::to_string(&chat_completions_request_for_arch_router).unwrap()
|
||||
);
|
||||
|
||||
let trace_parent = request_headers
|
||||
.iter()
|
||||
.find(|(ty, _)| ty.as_str() == "traceparent")
|
||||
.map(|(_, value)| value.to_str().unwrap_or_default().to_string());
|
||||
|
||||
let usage_preferences_str: Option<String> = routing_metadata.as_ref().and_then(|metadata| {
|
||||
metadata
|
||||
.get("archgw_preference_config")
|
||||
.map(|value| value.to_string())
|
||||
});
|
||||
|
||||
let usage_preferences: Option<Vec<ModelUsagePreference>> = usage_preferences_str
|
||||
.as_ref()
|
||||
.and_then(|s| serde_yaml::from_str(s).ok());
|
||||
|
||||
let latest_message_for_log = chat_completions_request_for_arch_router
|
||||
.messages
|
||||
.last()
|
||||
.map_or("None".to_string(), |msg| {
|
||||
msg.content.to_string().replace('\n', "\\n")
|
||||
});
|
||||
|
||||
const MAX_MESSAGE_LENGTH: usize = 50;
|
||||
let latest_message_for_log = if latest_message_for_log.chars().count() > MAX_MESSAGE_LENGTH {
|
||||
let truncated: String = latest_message_for_log
|
||||
.chars()
|
||||
.take(MAX_MESSAGE_LENGTH)
|
||||
.collect();
|
||||
format!("{}...", truncated)
|
||||
} else {
|
||||
latest_message_for_log
|
||||
};
|
||||
|
||||
info!(
|
||||
"request received, request type: chat_completion, usage preferences from request: {}, request path: {}, latest message: {}",
|
||||
usage_preferences.is_some(),
|
||||
request_path,
|
||||
latest_message_for_log
|
||||
);
|
||||
|
||||
debug!("usage preferences from request: {:?}", usage_preferences);
|
||||
|
||||
let model_name = match router_service
|
||||
.determine_route(
|
||||
&chat_completions_request_for_arch_router.messages,
|
||||
trace_parent.clone(),
|
||||
usage_preferences,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(route) => match route {
|
||||
Some((_, model_name)) => model_name,
|
||||
None => {
|
||||
info!(
|
||||
"No route determined, using default model from request: {}",
|
||||
chat_completions_request_for_arch_router.model
|
||||
);
|
||||
chat_completions_request_for_arch_router.model.clone()
|
||||
}
|
||||
},
|
||||
Err(err) => {
|
||||
let err_msg = format!("Failed to determine route: {}", err);
|
||||
let mut internal_error = Response::new(full(err_msg));
|
||||
*internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
return Ok(internal_error);
|
||||
}
|
||||
};
|
||||
|
||||
debug!(
|
||||
"[ARCH_ROUTER] URL: {}, Resolved Model: {}",
|
||||
full_qualified_llm_provider_url, model_name
|
||||
);
|
||||
|
||||
request_headers.insert(
|
||||
ARCH_PROVIDER_HINT_HEADER,
|
||||
header::HeaderValue::from_str(&model_name).unwrap(),
|
||||
);
|
||||
|
||||
request_headers.insert(
|
||||
header::HeaderName::from_static(ARCH_IS_STREAMING_HEADER),
|
||||
header::HeaderValue::from_str(&is_streaming_request.to_string()).unwrap(),
|
||||
);
|
||||
|
||||
if let Some(trace_parent) = trace_parent {
|
||||
request_headers.insert(
|
||||
header::HeaderName::from_static("traceparent"),
|
||||
header::HeaderValue::from_str(&trace_parent).unwrap(),
|
||||
);
|
||||
}
|
||||
// remove content-length header if it exists
|
||||
request_headers.remove(header::CONTENT_LENGTH);
|
||||
|
||||
let llm_response = match reqwest::Client::new()
|
||||
.post(full_qualified_llm_provider_url)
|
||||
.headers(request_headers)
|
||||
.body(client_request_bytes_for_upstream)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(res) => res,
|
||||
Err(err) => {
|
||||
let err_msg = format!("Failed to send request: {}", err);
|
||||
let mut internal_error = Response::new(full(err_msg));
|
||||
*internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
return Ok(internal_error);
|
||||
}
|
||||
};
|
||||
|
||||
// copy over the headers and status code from the original response
|
||||
let response_headers = llm_response.headers().clone();
|
||||
let upstream_status = llm_response.status();
|
||||
let mut response = Response::builder().status(upstream_status);
|
||||
let headers = response.headers_mut().unwrap();
|
||||
for (header_name, header_value) in response_headers.iter() {
|
||||
headers.insert(header_name, header_value.clone());
|
||||
}
|
||||
|
||||
// Use the streaming utility with a passthrough processor (no modification of chunks)
|
||||
let byte_stream = llm_response.bytes_stream();
|
||||
let processor = PassthroughProcessor;
|
||||
let streaming_response = create_streaming_response(byte_stream, processor, 16);
|
||||
|
||||
match response.body(streaming_response.body) {
|
||||
Ok(response) => Ok(response),
|
||||
Err(err) => {
|
||||
let err_msg = format!("Failed to create response: {}", err);
|
||||
let mut internal_error = Response::new(full(err_msg));
|
||||
*internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
Ok(internal_error)
|
||||
}
|
||||
}
|
||||
}
|
||||
243
crates/brightstaff/src/handlers/router_chat.rs
Normal file
243
crates/brightstaff/src/handlers/router_chat.rs
Normal file
|
|
@ -0,0 +1,243 @@
|
|||
use common::configuration::ModelUsagePreference;
|
||||
use common::traces::{TraceCollector, SpanKind, SpanBuilder, parse_traceparent};
|
||||
use hermesllm::clients::endpoints::SupportedUpstreamAPIs;
|
||||
use hermesllm::{ProviderRequest, ProviderRequestType};
|
||||
use hyper::StatusCode;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::router::llm_router::RouterService;
|
||||
use crate::tracing::{OperationNameBuilder, operation_component, http, routing};
|
||||
|
||||
/// Successful routing outcome: the upstream model the request should be sent to.
pub struct RoutingResult {
    // Name of the model selected by the router (or the request's default).
    pub model_name: String
}

/// Routing failure, carrying the message and HTTP status code that the
/// HTTP handler should return to the client.
pub struct RoutingError {
    pub message: String,
    pub status_code: StatusCode,
}

impl RoutingError {
    /// Convenience constructor for a 500 Internal Server Error routing failure.
    pub fn internal_error(message: String) -> Self {
        Self {
            message,
            status_code: StatusCode::INTERNAL_SERVER_ERROR
        }
    }
}
|
||||
|
||||
/// Determines which upstream model should serve this chat request.
///
/// Converts the provider request to the OpenAI ChatCompletions format, extracts
/// usage preferences from request metadata, asks the `RouterService` for a
/// route, and records a routing trace span for every outcome (routed, fell
/// back to the request's default model, or errored).
///
/// # Returns
/// * `Ok(RoutingResult)` - Contains the selected model name
/// * `Err(RoutingError)` - Contains the error message and HTTP status to return
pub async fn router_chat_get_upstream_model(
    router_service: Arc<RouterService>,
    client_request: ProviderRequestType,
    request_headers: &hyper::HeaderMap,
    trace_collector: Arc<TraceCollector>,
    traceparent: &str,
    request_path: &str,
) -> Result<RoutingResult, RoutingError> {
    // Clone metadata for routing before converting (which consumes client_request)
    let routing_metadata = client_request.metadata().clone();

    // Convert to ChatCompletionsRequest for routing (regardless of input type)
    let chat_request = match ProviderRequestType::try_from((
        client_request,
        &SupportedUpstreamAPIs::OpenAIChatCompletions(
            hermesllm::apis::OpenAIApi::ChatCompletions,
        ),
    )) {
        Ok(ProviderRequestType::ChatCompletionsRequest(req)) => req,
        // Conversion targeted OpenAI ChatCompletions, so any other variant is a bug.
        Ok(
            ProviderRequestType::MessagesRequest(_)
            | ProviderRequestType::BedrockConverse(_)
            | ProviderRequestType::BedrockConverseStream(_)
            | ProviderRequestType::ResponsesAPIRequest(_),
        ) => {
            warn!("Unexpected: got non-ChatCompletions request after converting to OpenAI format");
            return Err(RoutingError::internal_error(
                "Request conversion failed".to_string(),
            ));
        }
        Err(err) => {
            warn!("Failed to convert request to ChatCompletionsRequest: {}", err);
            return Err(RoutingError::internal_error(format!(
                "Failed to convert request: {}",
                err
            )));
        }
    };

    // NOTE(review): serde_json::to_string(...).unwrap() — presumably a parsed
    // request always re-serializes; confirm, since this runs even when debug
    // logging is disabled.
    debug!(
        "[ARCH_ROUTER REQ]: {}",
        &serde_json::to_string(&chat_request).unwrap()
    );

    // Extract trace_parent from headers (forwarded to the router service;
    // distinct from the `traceparent` argument used for the span below).
    let trace_parent = request_headers
        .iter()
        .find(|(ty, _)| ty.as_str() == "traceparent")
        .map(|(_, value)| value.to_str().unwrap_or_default().to_string());

    // Extract usage preferences from metadata
    let usage_preferences_str: Option<String> = routing_metadata.as_ref().and_then(|metadata| {
        metadata
            .get("archgw_preference_config")
            .map(|value| value.to_string())
    });

    // Preferences are YAML-encoded; a parse failure is silently treated as "none".
    let usage_preferences: Option<Vec<ModelUsagePreference>> = usage_preferences_str
        .as_ref()
        .and_then(|s| serde_yaml::from_str(s).ok());

    // Prepare log message with latest message from chat request
    let latest_message_for_log = chat_request
        .messages
        .last()
        .map_or("None".to_string(), |msg| {
            msg.content.to_string().replace('\n', "\\n")
        });

    // Truncate by characters (not bytes) so multi-byte text is not split.
    const MAX_MESSAGE_LENGTH: usize = 50;
    let latest_message_for_log = if latest_message_for_log.chars().count() > MAX_MESSAGE_LENGTH {
        let truncated: String = latest_message_for_log
            .chars()
            .take(MAX_MESSAGE_LENGTH)
            .collect();
        format!("{}...", truncated)
    } else {
        latest_message_for_log
    };

    info!(
        "request received, request type: chat_completion, usage preferences from request: {}, request path: {}, latest message: {}",
        usage_preferences.is_some(),
        request_path,
        latest_message_for_log
    );

    debug!("usage preferences from request: {:?}", usage_preferences);

    // Capture start time for routing span
    // Instant for elapsed-ms measurement, SystemTime for the span's wall-clock start.
    let routing_start_time = std::time::Instant::now();
    let routing_start_system_time = std::time::SystemTime::now();

    // Attempt to determine route using the router service
    let routing_result = router_service
        .determine_route(&chat_request.messages, trace_parent, usage_preferences)
        .await;

    match routing_result {
        Ok(route) => match route {
            Some((_, model_name)) => {
                // Record successful routing span
                let mut attrs: HashMap<String, String> = HashMap::new();
                attrs.insert("route.selected_model".to_string(), model_name.clone());
                record_routing_span(
                    trace_collector,
                    traceparent,
                    routing_start_time,
                    routing_start_system_time,
                    attrs,
                )
                .await;

                Ok(RoutingResult {
                    model_name
                })
            }
            None => {
                // No route determined, use default model from request
                info!(
                    "No route determined, using default model from request: {}",
                    chat_request.model
                );

                let default_model = chat_request.model.clone();
                let mut attrs = HashMap::new();
                attrs.insert("route.selected_model".to_string(), default_model.clone());
                record_routing_span(
                    trace_collector,
                    traceparent,
                    routing_start_time,
                    routing_start_system_time,
                    attrs,
                )
                .await;

                Ok(RoutingResult {
                    model_name: default_model
                })
            }
        },
        Err(err) => {
            // Record failed routing span (model unknown, error message attached)
            let mut attrs = HashMap::new();
            attrs.insert("route.selected_model".to_string(), "unknown".to_string());
            attrs.insert("error.message".to_string(), err.to_string());
            record_routing_span(
                trace_collector,
                traceparent,
                routing_start_time,
                routing_start_system_time,
                attrs,
            )
            .await;

            Err(RoutingError::internal_error(
                format!("Failed to determine route: {}", err)
            ))
        }
    }
}
|
||||
|
||||
/// Helper function to record a routing span with the given attributes.
/// Reduces code duplication across different routing outcomes.
///
/// The span covers the `determine_route` call: started at
/// `start_system_time`, ended at the moment this helper runs, with the
/// elapsed milliseconds (from `start_time`) recorded as an attribute.
/// `attrs` carries outcome-specific attributes such as
/// `route.selected_model` and `error.message`.
async fn record_routing_span(
    trace_collector: Arc<TraceCollector>,
    traceparent: &str,
    start_time: std::time::Instant,
    start_system_time: std::time::SystemTime,
    attrs: HashMap<String, String>,
) {
    // The routing always uses OpenAI Chat Completions format internally,
    // so we log that as the actual API being used for routing
    let routing_api_path = "/v1/chat/completions";

    // NOTE(review): the router model name is hardcoded here — keep in sync
    // with the deployed router model, or source it from configuration.
    let routing_operation_name = OperationNameBuilder::new()
        .with_method("POST")
        .with_path(routing_api_path)
        .with_target("Arch-Router-1.5B")
        .build();

    let (trace_id, parent_span_id) = parse_traceparent(traceparent);

    // Build the routing span directly using constants
    let mut span_builder = SpanBuilder::new(&routing_operation_name)
        .with_trace_id(&trace_id)
        .with_kind(SpanKind::Client)
        .with_start_time(start_system_time)
        .with_end_time(std::time::SystemTime::now())
        .with_attribute(http::METHOD, "POST")
        .with_attribute(http::TARGET, routing_api_path.to_string())
        .with_attribute(routing::ROUTE_DETERMINATION_MS, start_time.elapsed().as_millis().to_string());

    // Only set parent span ID if it exists (not a root span)
    if let Some(parent) = parent_span_id {
        span_builder = span_builder.with_parent_span_id(&parent);
    }

    // Add all custom attributes
    for (key, value) in attrs {
        span_builder = span_builder.with_attribute(key, value);
    }

    let span = span_builder.build();

    // Record the span directly to the collector
    trace_collector.record_span(operation_component::ROUTING, span);
}
|
||||
|
|
@ -1,18 +1,27 @@
|
|||
use bytes::Bytes;
|
||||
use common::traces::{Span, Attribute, AttributeValue, TraceCollector, Event};
|
||||
use http_body_util::combinators::BoxBody;
|
||||
use http_body_util::StreamBody;
|
||||
use hyper::body::Frame;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Instant, SystemTime};
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use tokio_stream::StreamExt;
|
||||
use tracing::warn;
|
||||
|
||||
// Import tracing constants
|
||||
use crate::tracing::{llm, error};
|
||||
|
||||
/// Trait for processing streaming chunks
/// Implementors can inject custom logic during streaming (e.g., hallucination detection, logging)
pub trait StreamProcessor: Send + 'static {
    /// Process an incoming chunk of bytes
    ///
    /// Returns `Ok(Some(bytes))` to forward the (possibly transformed) bytes
    /// downstream, or `Err(msg)` to signal a processing failure.
    fn process_chunk(&mut self, chunk: Bytes) -> Result<Option<Bytes>, String>;

    /// Called when the first bytes are received (for time-to-first-token tracking)
    fn on_first_bytes(&mut self) {}

    /// Called when streaming completes successfully
    fn on_complete(&mut self) {}

    /// Called when the stream terminates with an error; `_error` is a human-readable description.
    fn on_error(&mut self, _error: &str) {}
}
|
||||
|
||||
/// A no-op processor that just forwards chunks as-is
|
||||
pub struct PassthroughProcessor;
|
||||
/// A processor that tracks streaming metrics and finalizes the span
pub struct ObservableStreamProcessor {
    /// Collector the finalized span is recorded to.
    collector: Arc<TraceCollector>,
    /// Service name the span is recorded under (e.g., an `operation_component` value).
    service_name: String,
    /// Span being enriched with streaming metrics; finalized on completion or error.
    span: Span,
    /// Running total of response bytes observed so far.
    total_bytes: usize,
    /// Number of chunks processed so far.
    chunk_count: usize,
    /// When the request started; used for duration and time-to-first-token calculations.
    start_time: Instant,
    /// Milliseconds until the first bytes arrived; set once (streaming only).
    time_to_first_token: Option<u128>,
}
|
||||
|
||||
impl StreamProcessor for PassthroughProcessor {
|
||||
impl ObservableStreamProcessor {
    /// Create a new observable stream processor
    ///
    /// # Arguments
    /// * `collector` - The trace collector to record the span to
    /// * `service_name` - The service name for this span (e.g., "archgw(llm)")
    /// * `span` - The span to finalize after streaming completes
    /// * `start_time` - When the request started (for duration calculation)
    pub fn new(
        collector: Arc<TraceCollector>,
        service_name: impl Into<String>,
        span: Span,
        start_time: Instant,
    ) -> Self {
        Self {
            collector,
            service_name: service_name.into(),
            span,
            // Byte/chunk counters start at zero and accumulate per processed chunk.
            total_bytes: 0,
            chunk_count: 0,
            start_time,
            // Populated on the first received bytes (streaming only).
            time_to_first_token: None,
        }
    }
}
|
||||
|
||||
impl StreamProcessor for ObservableStreamProcessor {
|
||||
fn process_chunk(&mut self, chunk: Bytes) -> Result<Option<Bytes>, String> {
|
||||
self.total_bytes += chunk.len();
|
||||
self.chunk_count += 1;
|
||||
Ok(Some(chunk))
|
||||
}
|
||||
|
||||
fn on_first_bytes(&mut self) {
|
||||
// Record time to first token (only for streaming)
|
||||
if self.time_to_first_token.is_none() {
|
||||
self.time_to_first_token = Some(self.start_time.elapsed().as_millis());
|
||||
}
|
||||
}
|
||||
|
||||
fn on_complete(&mut self) {
|
||||
// Update span with streaming metrics and end time
|
||||
let end_time_nanos = SystemTime::now()
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_nanos();
|
||||
|
||||
self.span.end_time_unix_nano = format!("{}", end_time_nanos);
|
||||
|
||||
// Add streaming metrics as attributes using constants
|
||||
self.span.attributes.push(Attribute {
|
||||
key: llm::RESPONSE_BYTES.to_string(),
|
||||
value: AttributeValue {
|
||||
string_value: Some(self.total_bytes.to_string()),
|
||||
},
|
||||
});
|
||||
|
||||
|
||||
self.span.attributes.push(Attribute {
|
||||
key: llm::DURATION_MS.to_string(),
|
||||
value: AttributeValue {
|
||||
string_value: Some(self.start_time.elapsed().as_millis().to_string()),
|
||||
},
|
||||
});
|
||||
|
||||
// Add time to first token if available (streaming only)
|
||||
if let Some(ttft) = self.time_to_first_token {
|
||||
self.span.attributes.push(Attribute {
|
||||
key: llm::TIME_TO_FIRST_TOKEN_MS.to_string(),
|
||||
value: AttributeValue {
|
||||
string_value: Some(ttft.to_string()),
|
||||
},
|
||||
});
|
||||
|
||||
// Add time to first token as a span event
|
||||
// Calculate the timestamp by adding ttft duration to span start time
|
||||
if let Ok(start_time_nanos) = self.span.start_time_unix_nano.parse::<u128>() {
|
||||
// Convert ttft from milliseconds to nanoseconds and add to start time
|
||||
let event_timestamp = start_time_nanos + (ttft * 1_000_000);
|
||||
let mut event = Event::new(llm::TIME_TO_FIRST_TOKEN_MS.to_string(), event_timestamp);
|
||||
event.add_attribute(
|
||||
llm::TIME_TO_FIRST_TOKEN_MS.to_string(),
|
||||
ttft.to_string(),
|
||||
);
|
||||
|
||||
// Initialize events vector if needed
|
||||
if self.span.events.is_none() {
|
||||
self.span.events = Some(Vec::new());
|
||||
}
|
||||
|
||||
if let Some(ref mut events) = self.span.events {
|
||||
events.push(event);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Record the finalized span
|
||||
self.collector.record_span(&self.service_name, self.span.clone());
|
||||
}
|
||||
|
||||
fn on_error(&mut self, error_msg: &str) {
|
||||
warn!("Stream error in PassthroughProcessor: {}", error_msg);
|
||||
|
||||
// Update span with error info and end time
|
||||
let end_time_nanos = SystemTime::now()
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_nanos();
|
||||
|
||||
self.span.end_time_unix_nano = format!("{}", end_time_nanos);
|
||||
|
||||
self.span.attributes.push(Attribute {
|
||||
key: error::ERROR.to_string(),
|
||||
value: AttributeValue {
|
||||
string_value: Some("true".to_string()),
|
||||
},
|
||||
});
|
||||
|
||||
self.span.attributes.push(Attribute {
|
||||
key: error::MESSAGE.to_string(),
|
||||
value: AttributeValue {
|
||||
string_value: Some(error_msg.to_string()),
|
||||
},
|
||||
});
|
||||
|
||||
self.span.attributes.push(Attribute {
|
||||
key: llm::DURATION_MS.to_string(),
|
||||
value: AttributeValue {
|
||||
string_value: Some(self.start_time.elapsed().as_millis().to_string()),
|
||||
},
|
||||
});
|
||||
|
||||
// Record the error span
|
||||
self.collector.record_span(&self.service_name, self.span.clone());
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of creating a streaming response
|
||||
|
|
@ -48,6 +196,8 @@ where
|
|||
|
||||
// Spawn a task to process and forward chunks
|
||||
let processor_handle = tokio::spawn(async move {
|
||||
let mut is_first_chunk = true;
|
||||
|
||||
while let Some(item) = byte_stream.next().await {
|
||||
let chunk = match item {
|
||||
Ok(chunk) => chunk,
|
||||
|
|
@ -59,6 +209,12 @@ where
|
|||
}
|
||||
};
|
||||
|
||||
// Call on_first_bytes for the first chunk
|
||||
if is_first_chunk {
|
||||
processor.on_first_bytes();
|
||||
is_first_chunk = false;
|
||||
}
|
||||
|
||||
// Process the chunk
|
||||
match processor.process_chunk(chunk) {
|
||||
Ok(Some(processed_chunk)) => {
|
||||
|
|
@ -91,3 +247,13 @@ where
|
|||
processor_handle,
|
||||
}
|
||||
}
|
||||
|
||||
/// Truncates a message to the specified maximum length, adding "..." if truncated.
///
/// Length is measured in `char`s, so multi-byte UTF-8 text is never split
/// mid-character.
pub fn truncate_message(message: &str, max_length: usize) -> String {
    // If a char exists at index `max_length`, the message is too long; cut at
    // that char's byte offset and append the ellipsis. Otherwise return as-is.
    match message.char_indices().nth(max_length) {
        Some((byte_idx, _)) => format!("{}...", &message[..byte_idx]),
        None => message.to_string(),
    }
}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
pub mod handlers;
|
||||
pub mod router;
|
||||
pub mod tracing;
|
||||
pub mod utils;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
use brightstaff::handlers::agent_chat_completions::agent_chat;
|
||||
use brightstaff::handlers::router::router_chat;
|
||||
use brightstaff::handlers::llm::llm_chat;
|
||||
use brightstaff::handlers::models::list_models;
|
||||
use brightstaff::handlers::function_calling::{function_calling_chat_handler};
|
||||
use brightstaff::router::llm_router::RouterService;
|
||||
|
|
@ -7,6 +7,7 @@ use brightstaff::utils::tracing::init_tracer;
|
|||
use bytes::Bytes;
|
||||
use common::configuration::Configuration;
|
||||
use common::consts::{CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH};
|
||||
use common::traces::TraceCollector;
|
||||
use http_body_util::{combinators::BoxBody, BodyExt, Empty};
|
||||
use hyper::body::Incoming;
|
||||
use hyper::server::conn::http1;
|
||||
|
|
@ -46,10 +47,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
let _tracer_provider = init_tracer();
|
||||
let bind_address = env::var("BIND_ADDRESS").unwrap_or_else(|_| BIND_ADDRESS.to_string());
|
||||
|
||||
info!(
|
||||
"current working directory: {}",
|
||||
env::current_dir().unwrap().display()
|
||||
);
|
||||
// loading arch_config.yaml file
|
||||
let arch_config_path = env::var("ARCH_CONFIG_PATH_RENDERED")
|
||||
.unwrap_or_else(|_| "./arch_config_rendered.yaml".to_string());
|
||||
|
|
@ -66,19 +63,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
let llm_providers = Arc::new(RwLock::new(arch_config.model_providers.clone()));
|
||||
let agents_list = Arc::new(RwLock::new(arch_config.agents.clone()));
|
||||
let listeners = Arc::new(RwLock::new(arch_config.listeners.clone()));
|
||||
|
||||
debug!(
|
||||
"arch_config: {:?}",
|
||||
&serde_json::to_string(arch_config.as_ref()).unwrap()
|
||||
);
|
||||
|
||||
let llm_provider_url =
|
||||
env::var("LLM_PROVIDER_ENDPOINT").unwrap_or_else(|_| "http://localhost:12001".to_string());
|
||||
|
||||
info!("llm provider url: {}", llm_provider_url);
|
||||
info!("listening on http://{}", bind_address);
|
||||
let listener = TcpListener::bind(bind_address).await?;
|
||||
|
||||
let routing_model_name: String = arch_config
|
||||
.routing
|
||||
.as_ref()
|
||||
|
|
@ -100,18 +88,33 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
|
||||
let model_aliases = Arc::new(arch_config.model_aliases.clone());
|
||||
|
||||
// Initialize trace collector and start background flusher
|
||||
// Tracing is enabled if the tracing config is present in arch_config.yaml
|
||||
// Pass Some(true/false) to override, or None to use env var OTEL_TRACING_ENABLED
|
||||
let tracing_enabled = if arch_config.tracing.is_some() {
|
||||
info!("Tracing configuration found in arch_config.yaml");
|
||||
Some(true)
|
||||
} else {
|
||||
info!("No tracing configuration in arch_config.yaml, will check OTEL_TRACING_ENABLED env var");
|
||||
None
|
||||
};
|
||||
let trace_collector = Arc::new(TraceCollector::new(tracing_enabled));
|
||||
let _flusher_handle = trace_collector.clone().start_background_flusher();
|
||||
|
||||
|
||||
loop {
|
||||
let (stream, _) = listener.accept().await?;
|
||||
let peer_addr = stream.peer_addr()?;
|
||||
let io = TokioIo::new(stream);
|
||||
|
||||
let router_service: Arc<RouterService> = Arc::clone(&router_service);
|
||||
let model_aliases = Arc::clone(&model_aliases);
|
||||
let model_aliases: Arc<Option<std::collections::HashMap<String, common::configuration::ModelAlias>>> = Arc::clone(&model_aliases);
|
||||
let llm_provider_url = llm_provider_url.clone();
|
||||
|
||||
let llm_providers = llm_providers.clone();
|
||||
let agents_list = agents_list.clone();
|
||||
let listeners = listeners.clone();
|
||||
let trace_collector = trace_collector.clone();
|
||||
let service = service_fn(move |req| {
|
||||
let router_service = Arc::clone(&router_service);
|
||||
let parent_cx = extract_context_from_request(&req);
|
||||
|
|
@ -120,13 +123,14 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
let model_aliases = Arc::clone(&model_aliases);
|
||||
let agents_list = agents_list.clone();
|
||||
let listeners = listeners.clone();
|
||||
let trace_collector = trace_collector.clone();
|
||||
|
||||
async move {
|
||||
match (req.method(), req.uri().path()) {
|
||||
(&Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH) => {
|
||||
let fully_qualified_url =
|
||||
format!("{}{}", llm_provider_url, req.uri().path());
|
||||
router_chat(req, router_service, fully_qualified_url, model_aliases)
|
||||
llm_chat(req, router_service, fully_qualified_url, model_aliases, llm_providers, trace_collector)
|
||||
.with_context(parent_cx)
|
||||
.await
|
||||
}
|
||||
|
|
|
|||
319
crates/brightstaff/src/tracing/constants.rs
Normal file
319
crates/brightstaff/src/tracing/constants.rs
Normal file
|
|
@ -0,0 +1,319 @@
|
|||
//! OpenTelemetry Semantic Conventions
//!
//! This module defines standard attribute keys following OTEL semantic conventions.
//! See: https://opentelemetry.io/docs/specs/semconv/
//
// NOTE: the overview above uses inner doc comments (`//!`) so rustdoc attaches
// it to this module/file. Previously it was written with outer `///` comments,
// which silently attached the file overview to `pub mod http` instead.

// =============================================================================
// Span Attributes - HTTP
// =============================================================================

/// Semantic conventions for HTTP-related span attributes
pub mod http {
    /// HTTP request method
    /// Example: "GET", "POST", "PUT"
    pub const METHOD: &str = "http.method";

    /// HTTP response status code
    /// Example: "200", "404", "500"
    pub const STATUS_CODE: &str = "http.status_code";

    /// Full HTTP request URL
    pub const URL: &str = "http.url";

    /// HTTP request target (path + query)
    /// Example: "/v1/chat/completions?stream=true"
    pub const TARGET: &str = "http.target";

    /// Upstream target path after routing transformation
    /// Example: "/api/paas/v4/chat/completions" (for Zhipu provider)
    pub const UPSTREAM_TARGET: &str = "http.upstream_target";

    /// HTTP request scheme
    /// Example: "http", "https"
    pub const SCHEME: &str = "http.scheme";

    /// Value of the HTTP User-Agent header
    pub const USER_AGENT: &str = "http.user_agent";

    /// Size of the request payload body in bytes
    pub const REQUEST_CONTENT_LENGTH: &str = "http.request_content_length";

    /// Size of the response payload body in bytes
    pub const RESPONSE_CONTENT_LENGTH: &str = "http.response_content_length";
}
|
||||
|
||||
// =============================================================================
|
||||
// Span Attributes - LLM Specific
|
||||
// =============================================================================
|
||||
|
||||
/// Custom attributes for LLM operations
/// These follow the emerging OTEL GenAI semantic conventions
pub mod llm {
    /// Name of the LLM model being called
    /// Example: "gpt-4", "claude-3-sonnet", "llama-2-70b"
    pub const MODEL_NAME: &str = "llm.model";

    /// Provider of the LLM
    /// Example: "openai", "anthropic", "azure-openai"
    pub const PROVIDER: &str = "llm.provider";

    /// Type of LLM operation
    /// Example: "chat", "completion", "embedding"
    pub const OPERATION_TYPE: &str = "llm.operation_type";

    /// Whether the request is streaming
    pub const IS_STREAMING: &str = "llm.is_streaming";

    /// Total bytes received in the response
    pub const RESPONSE_BYTES: &str = "llm.response_bytes";

    /// Duration of the LLM call in milliseconds
    pub const DURATION_MS: &str = "llm.duration_ms";

    /// Time to first token in milliseconds (streaming only)
    /// NOTE(review): the key string omits an "_ms" suffix, unlike DURATION_MS —
    /// confirm this asymmetry is intended before renaming either.
    pub const TIME_TO_FIRST_TOKEN_MS: &str = "llm.time_to_first_token";

    /// Number of prompt tokens used
    pub const PROMPT_TOKENS: &str = "llm.usage.prompt_tokens";

    /// Number of completion tokens generated
    pub const COMPLETION_TOKENS: &str = "llm.usage.completion_tokens";

    /// Total tokens used (prompt + completion)
    pub const TOTAL_TOKENS: &str = "llm.usage.total_tokens";

    /// Temperature parameter used
    pub const TEMPERATURE: &str = "llm.temperature";

    /// Max tokens parameter used
    pub const MAX_TOKENS: &str = "llm.max_tokens";

    /// Top-p parameter used
    pub const TOP_P: &str = "llm.top_p";

    /// List of tool names provided in the request
    pub const TOOLS: &str = "llm.tools";

    /// Preview of the user message (truncated)
    pub const USER_MESSAGE_PREVIEW: &str = "llm.user_message_preview";
}
|
||||
|
||||
// =============================================================================
|
||||
// Span Attributes - Routing & Gateway
|
||||
// =============================================================================
|
||||
|
||||
/// Attributes specific to LLM routing and gateway operations
///
/// These keys are gateway-specific (not covered by OTEL semconv); keep them
/// stable so downstream dashboards and queries do not break.
pub mod routing {
    /// Strategy used to select the LLM endpoint
    /// Example: "round-robin", "least-latency", "cost-optimized"
    pub const STRATEGY: &str = "routing.strategy";

    /// Selected upstream endpoint
    pub const UPSTREAM_ENDPOINT: &str = "routing.upstream_endpoint";

    /// Time taken to determine the route in milliseconds
    pub const ROUTE_DETERMINATION_MS: &str = "routing.determination_ms";

    /// Whether a fallback endpoint was used
    pub const IS_FALLBACK: &str = "routing.is_fallback";

    /// Reason for route selection
    pub const SELECTION_REASON: &str = "routing.selection_reason";
}
|
||||
|
||||
// =============================================================================
|
||||
// Span Attributes - Error Handling
|
||||
// =============================================================================
|
||||
|
||||
/// Attributes for error and exception tracking
pub mod error {
    /// Whether an error occurred
    /// (recorded as the string value "true" when a stream fails)
    pub const ERROR: &str = "error";

    /// Type/class of the error
    /// Example: "TimeoutError", "AuthenticationError"
    pub const TYPE: &str = "error.type";

    /// Error message
    pub const MESSAGE: &str = "error.message";

    /// Stack trace of the error
    pub const STACK_TRACE: &str = "error.stack_trace";
}
|
||||
|
||||
// =============================================================================
|
||||
// Operation Names
|
||||
// =============================================================================
|
||||
|
||||
/// Canonical service-name components for Plano gateway spans.
///
/// These values are passed as the service name when recording spans
/// (e.g., `trace_collector.record_span(operation_component::ROUTING, span)`).
pub mod operation_component {
    /// Inbound request handling
    pub const INBOUND: &str = "plano(inbound)";

    /// Routing decision phase
    pub const ROUTING: &str = "plano(routing)";

    /// Handoff to upstream service
    pub const HANDOFF: &str = "plano(handoff)";

    /// Agent filter execution
    pub const AGENT_FILTER: &str = "plano(agent filter)";

    /// Agent execution
    pub const AGENT: &str = "plano(agent)";

    /// LLM call
    pub const LLM: &str = "plano(llm)";
}
|
||||
|
||||
/// Builder for constructing standardized operation names
///
/// Format: `{method} {path} {target}`
///
/// The operation component (e.g., "plano(llm)") is carried by the span's
/// service name, so the operation name focuses on the HTTP request details
/// and the target.
///
/// # Examples
/// ```
/// use brightstaff::tracing::OperationNameBuilder;
///
/// // LLM call operation: "POST /v1/chat/completions gpt-4"
/// let op = OperationNameBuilder::new()
///     .with_method("POST")
///     .with_path("/v1/chat/completions")
///     .with_target("gpt-4")
///     .build();
///
/// // Agent filter operation: "POST /agents/v1/chat/completions hallucination-detector"
/// let op = OperationNameBuilder::new()
///     .with_method("POST")
///     .with_path("/agents/v1/chat/completions")
///     .with_target("hallucination-detector")
///     .build();
///
/// // Routing operation (no target): "POST /v1/chat/completions"
/// let op = OperationNameBuilder::new()
///     .with_method("POST")
///     .with_path("/v1/chat/completions")
///     .build();
/// ```
pub struct OperationNameBuilder {
    /// HTTP method component (e.g., "POST").
    method: Option<String>,
    /// Request path component (e.g., "/v1/chat/completions").
    path: Option<String>,
    /// Target component: a model, agent, or filter name.
    target: Option<String>,
}

impl OperationNameBuilder {
    /// Create an empty builder with no components set.
    pub fn new() -> Self {
        Self {
            method: None,
            path: None,
            target: None,
        }
    }

    /// Set the HTTP method (e.g., "GET", "POST", "PUT").
    pub fn with_method(mut self, method: impl Into<String>) -> Self {
        self.method = Some(method.into());
        self
    }

    /// Set the request path (e.g., "/v1/chat/completions").
    pub fn with_path(mut self, path: impl Into<String>) -> Self {
        self.path = Some(path.into());
        self
    }

    /// Set the target identifier (e.g., "gpt-4", "my-agent", "hallucination-detector").
    pub fn with_target(mut self, target: impl Into<String>) -> Self {
        self.target = Some(target.into());
        self
    }

    /// Assemble the operation name by joining the set components with spaces.
    ///
    /// Unset components are simply omitted; an empty builder yields `""`.
    pub fn build(self) -> String {
        // Components are emitted in fixed order: method, path, target.
        [self.method, self.path, self.target]
            .into_iter()
            .flatten()
            .collect::<Vec<_>>()
            .join(" ")
    }
}

impl Default for OperationNameBuilder {
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // All three components present: joined with single spaces, in method/path/target order.
    #[test]
    fn test_operation_name_full() {
        let op = OperationNameBuilder::new()
            .with_method("POST")
            .with_path("/v1/chat/completions")
            .with_target("gpt-4")
            .build();

        assert_eq!(op, "POST /v1/chat/completions gpt-4");
    }

    // A missing target is simply omitted — no trailing space.
    #[test]
    fn test_operation_name_no_target() {
        let op = OperationNameBuilder::new()
            .with_method("POST")
            .with_path("/v1/chat/completions")
            .build();

        assert_eq!(op, "POST /v1/chat/completions");
    }

    // Agent-filter style operation name (path + filter name as target).
    #[test]
    fn test_operation_name_agent_filter() {
        let op = OperationNameBuilder::new()
            .with_method("POST")
            .with_path("/agents/v1/chat/completions")
            .with_target("content-filter")
            .build();

        assert_eq!(op, "POST /agents/v1/chat/completions content-filter");
    }

    // An empty builder produces an empty operation name.
    #[test]
    fn test_operation_name_minimal() {
        let op = OperationNameBuilder::new().build();
        assert_eq!(op, "");
    }
}
|
||||
3
crates/brightstaff/src/tracing/mod.rs
Normal file
3
crates/brightstaff/src/tracing/mod.rs
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
// Keep the constants module private; its public surface is exposed via the
// re-exports below so callers use `crate::tracing::{...}` directly.
mod constants;

pub use constants::{OperationNameBuilder, operation_component, http, llm, error, routing};
|
||||
Loading…
Add table
Add a link
Reference in a new issue