Mirror of https://github.com/katanemo/plano.git, synced 2026-05-06 06:12:59 +02:00
Split arch wasm filter code into prompt and llm gateway filters (#190)
This commit is contained in: parent 8e54ac20d8, commit 21e7fe2cef
13 changed files with 696 additions and 2801 deletions
@ -1,322 +0,0 @@ filter_context.rs (deleted)
use crate::stream_context::StreamContext;
use common::common_types::EmbeddingType;
use common::configuration::{Configuration, GatewayMode, Overrides, PromptGuards, PromptTarget};
use common::consts::ARCH_INTERNAL_CLUSTER_NAME;
use common::consts::ARCH_UPSTREAM_HOST_HEADER;
use common::consts::DEFAULT_EMBEDDING_MODEL;
use common::consts::MODEL_SERVER_NAME;
use common::embeddings::{
    CreateEmbeddingRequest, CreateEmbeddingRequestInput, CreateEmbeddingResponse,
};
use common::http::CallArgs;
use common::http::Client;
use common::llm_providers::LlmProviders;
use common::ratelimit;
use common::stats::Counter;
use common::stats::Gauge;
use common::stats::IncrementingMetric;
use log::debug;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;
use std::cell::RefCell;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::rc::Rc;
use std::time::Duration;

#[derive(Copy, Clone, Debug)]
pub struct WasmMetrics {
    pub active_http_calls: Gauge,
    pub ratelimited_rq: Counter,
}

impl WasmMetrics {
    fn new() -> WasmMetrics {
        WasmMetrics {
            active_http_calls: Gauge::new(String::from("active_http_calls")),
            ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
        }
    }
}

pub type EmbeddingTypeMap = HashMap<EmbeddingType, Vec<f64>>;
pub type EmbeddingsStore = HashMap<String, EmbeddingTypeMap>;

#[derive(Debug)]
pub struct FilterCallContext {
    pub prompt_target_name: String,
    pub embedding_type: EmbeddingType,
}

#[derive(Debug)]
pub struct FilterContext {
    metrics: Rc<WasmMetrics>,
    // callouts stores token_id to request mapping that we use during #on_http_call_response to match the response to the request.
    callouts: RefCell<HashMap<u32, FilterCallContext>>,
    overrides: Rc<Option<Overrides>>,
    system_prompt: Rc<Option<String>>,
    prompt_targets: Rc<HashMap<String, PromptTarget>>,
    mode: GatewayMode,
    prompt_guards: Rc<PromptGuards>,
    llm_providers: Option<Rc<LlmProviders>>,
    embeddings_store: Option<Rc<EmbeddingsStore>>,
    temp_embeddings_store: EmbeddingsStore,
}

impl FilterContext {
    pub fn new() -> FilterContext {
        FilterContext {
            callouts: RefCell::new(HashMap::new()),
            metrics: Rc::new(WasmMetrics::new()),
            system_prompt: Rc::new(None),
            prompt_targets: Rc::new(HashMap::new()),
            overrides: Rc::new(None),
            prompt_guards: Rc::new(PromptGuards::default()),
            mode: GatewayMode::Prompt,
            llm_providers: None,
            embeddings_store: Some(Rc::new(HashMap::new())),
            temp_embeddings_store: HashMap::new(),
        }
    }

    fn process_prompt_targets(&self) {
        for values in self.prompt_targets.iter() {
            let prompt_target = values.1;
            self.schedule_embeddings_call(
                &prompt_target.name,
                &prompt_target.description,
                EmbeddingType::Description,
            );
        }
    }

    fn schedule_embeddings_call(
        &self,
        prompt_target_name: &str,
        input: &str,
        embedding_type: EmbeddingType,
    ) {
        let embeddings_input = CreateEmbeddingRequest {
            input: Box::new(CreateEmbeddingRequestInput::String(String::from(input))),
            model: String::from(DEFAULT_EMBEDDING_MODEL),
            encoding_format: None,
            dimensions: None,
            user: None,
        };
        let json_data = serde_json::to_string(&embeddings_input).unwrap();

        let call_args = CallArgs::new(
            ARCH_INTERNAL_CLUSTER_NAME,
            "/embeddings",
            vec![
                (ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME),
                (":method", "POST"),
                (":path", "/embeddings"),
                (":authority", MODEL_SERVER_NAME),
                ("content-type", "application/json"),
                ("x-envoy-upstream-rq-timeout-ms", "60000"),
            ],
            Some(json_data.as_bytes()),
            vec![],
            Duration::from_secs(60),
        );

        let call_context = crate::filter_context::FilterCallContext {
            prompt_target_name: String::from(prompt_target_name),
            embedding_type,
        };

        if let Err(error) = self.http_call(call_args, call_context) {
            panic!("{error}")
        }
    }

    fn embedding_response_handler(
        &mut self,
        body_size: usize,
        embedding_type: EmbeddingType,
        prompt_target_name: String,
    ) {
        let prompt_target = self
            .prompt_targets
            .get(&prompt_target_name)
            .unwrap_or_else(|| {
                panic!(
                    "Received embeddings response for unknown prompt target name={}",
                    prompt_target_name
                )
            });

        let body = self
            .get_http_call_response_body(0, body_size)
            .expect("No body in response");
        if !body.is_empty() {
            let mut embedding_response: CreateEmbeddingResponse =
                match serde_json::from_slice(&body) {
                    Ok(response) => response,
                    Err(e) => {
                        panic!(
                            "Error deserializing embedding response. body: {:?}: {:?}",
                            String::from_utf8(body).unwrap(),
                            e
                        );
                    }
                };

            let embeddings = embedding_response.data.remove(0).embedding;
            debug!(
                "Adding embeddings for prompt target name: {:?}, description: {:?}, embedding type: {:?}",
                prompt_target.name, prompt_target.description, embedding_type
            );

            let entry = self.temp_embeddings_store.entry(prompt_target_name);
            match entry {
                Entry::Occupied(_) => {
                    entry.and_modify(|e| {
                        if let Entry::Vacant(e) = e.entry(embedding_type) {
                            e.insert(embeddings);
                        } else {
                            panic!(
                                "Duplicate {:?} for prompt target with name=\"{}\"",
                                &embedding_type, prompt_target.name
                            )
                        }
                    });
                }
                Entry::Vacant(_) => {
                    entry.or_insert(HashMap::from([(embedding_type, embeddings)]));
                }
            }

            if self.prompt_targets.len() == self.temp_embeddings_store.len() {
                self.embeddings_store =
                    Some(Rc::new(std::mem::take(&mut self.temp_embeddings_store)))
            }
        }
    }
}

impl Client for FilterContext {
    type CallContext = FilterCallContext;

    fn callouts(&self) -> &RefCell<HashMap<u32, Self::CallContext>> {
        &self.callouts
    }

    fn active_http_calls(&self) -> &Gauge {
        &self.metrics.active_http_calls
    }
}

impl Context for FilterContext {
    fn on_http_call_response(
        &mut self,
        token_id: u32,
        _num_headers: usize,
        body_size: usize,
        _num_trailers: usize,
    ) {
        debug!(
            "filter_context: on_http_call_response called with token_id: {:?}",
            token_id
        );
        let callout_data = self
            .callouts
            .borrow_mut()
            .remove(&token_id)
            .expect("invalid token_id");

        self.metrics.active_http_calls.increment(-1);

        self.embedding_response_handler(
            body_size,
            callout_data.embedding_type,
            callout_data.prompt_target_name,
        )
    }
}

// RootContext allows the Rust code to reach into the Envoy Config
impl RootContext for FilterContext {
    fn on_configure(&mut self, _: usize) -> bool {
        let config_bytes = self
            .get_plugin_configuration()
            .expect("Arch config cannot be empty");

        let config: Configuration = match serde_yaml::from_slice(&config_bytes) {
            Ok(config) => config,
            Err(err) => panic!("Invalid arch config \"{:?}\"", err),
        };

        self.overrides = Rc::new(config.overrides);

        let mut prompt_targets = HashMap::new();
        for pt in config.prompt_targets {
            prompt_targets.insert(pt.name.clone(), pt.clone());
        }
        self.system_prompt = Rc::new(config.system_prompt);
        self.prompt_targets = Rc::new(prompt_targets);
        self.mode = config.mode.unwrap_or_default();

        ratelimit::ratelimits(Some(config.ratelimits.unwrap_or_default()));

        if let Some(prompt_guards) = config.prompt_guards {
            self.prompt_guards = Rc::new(prompt_guards)
        }

        match config.llm_providers.try_into() {
            Ok(llm_providers) => self.llm_providers = Some(Rc::new(llm_providers)),
            Err(err) => panic!("{err}"),
        }

        true
    }

    fn create_http_context(&self, context_id: u32) -> Option<Box<dyn HttpContext>> {
        debug!(
            "||| create_http_context called with context_id: {:?} |||",
            context_id
        );

        // No StreamContext can be created until the Embedding Store is fully initialized.
        let embedding_store = match self.mode {
            GatewayMode::Llm => None,
            GatewayMode::Prompt => Some(Rc::clone(self.embeddings_store.as_ref().unwrap())),
        };
        Some(Box::new(StreamContext::new(
            context_id,
            Rc::clone(&self.metrics),
            Rc::clone(&self.system_prompt),
            Rc::clone(&self.prompt_targets),
            Rc::clone(&self.prompt_guards),
            Rc::clone(&self.overrides),
            Rc::clone(
                self.llm_providers
                    .as_ref()
                    .expect("LLM Providers must exist when Streams are being created"),
            ),
            embedding_store,
            self.mode.clone(),
        )))
    }

    fn get_type(&self) -> Option<ContextType> {
        Some(ContextType::HttpContext)
    }

    fn on_vm_start(&mut self, _: usize) -> bool {
        self.set_tick_period(Duration::from_secs(1));
        true
    }

    fn on_tick(&mut self) {
        debug!("starting up arch filter in mode: {:?}", self.mode);
        if self.mode == GatewayMode::Prompt {
            self.process_prompt_targets();
        }

        self.set_tick_period(Duration::from_secs(0));
    }
}
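The deleted context drives its embedding bootstrap through the proxy-wasm callout pattern that the repo's common::http::Client trait wraps: dispatch a call, record the returned token id against per-call state, and look that state back up in on_http_call_response. A minimal sketch of the same round trip against the raw proxy-wasm SDK; the PendingCall type, the "arch_internal" cluster name, and the "model_server" authority are invented stand-ins for CallArgs/FilterCallContext above:

use log::debug;
use proxy_wasm::traits::Context;
use std::cell::RefCell;
use std::collections::HashMap;
use std::time::Duration;

// Invented per-call state; FilterCallContext plays this role above.
struct PendingCall {
    purpose: &'static str,
}

struct CalloutSketch {
    callouts: RefCell<HashMap<u32, PendingCall>>,
}

impl CalloutSketch {
    fn schedule_call(&self) {
        // dispatch_http_call returns a token id that identifies the eventual response.
        let token_id = self
            .dispatch_http_call(
                "arch_internal", // invented upstream cluster name
                vec![
                    (":method", "POST"),
                    (":path", "/embeddings"),
                    (":authority", "model_server"),
                ],
                Some("{}".as_bytes()),
                vec![],
                Duration::from_secs(60),
            )
            .expect("failed to dispatch http call");
        self.callouts
            .borrow_mut()
            .insert(token_id, PendingCall { purpose: "embeddings" });
    }
}

impl Context for CalloutSketch {
    fn on_http_call_response(&mut self, token_id: u32, _: usize, _: usize, _: usize) {
        // The token id minted at dispatch time routes the response back to its state.
        let pending = self
            .callouts
            .borrow_mut()
            .remove(&token_id)
            .expect("invalid token_id");
        debug!("response received for {} callout", pending.purpose);
    }
}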
@ -1,13 +1,13 @@ main.rs
-use filter_context::FilterContext;
+use llm_filter_context::LlmGatewayFilterContext;
 use proxy_wasm::traits::*;
 use proxy_wasm::types::*;

-mod filter_context;
-mod stream_context;
+mod llm_filter_context;
+mod llm_stream_context;

 proxy_wasm::main! {{
     proxy_wasm::set_log_level(LogLevel::Trace);
     proxy_wasm::set_root_context(|_| -> Box<dyn RootContext> {
-        Box::new(FilterContext::new())
+        Box::new(LlmGatewayFilterContext::new())
     });
 }}
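Only the llm_gateway half of the split is visible in this view (the remaining large diff is suppressed at the bottom of the page). Going by the commit title, the prompt-side crate presumably keeps a mirrored entrypoint built from the removed lines; a hypothetical sketch, with the module layout assumed:

use filter_context::FilterContext;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;

mod filter_context;
mod stream_context;

proxy_wasm::main! {{
    proxy_wasm::set_log_level(LogLevel::Trace);
    proxy_wasm::set_root_context(|_| -> Box<dyn RootContext> {
        // Root context of the prompt gateway filter (assumed placement).
        Box::new(FilterContext::new())
    });
}}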
crates/llm_gateway/src/llm_filter_context.rs (new file, 108 lines)
@ -0,0 +1,108 @@
use crate::llm_stream_context::LlmGatewayStreamContext;
use common::configuration::Configuration;
use common::http::Client;
use common::llm_providers::LlmProviders;
use common::ratelimit;
use common::stats::Counter;
use common::stats::Gauge;
use log::debug;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;
use std::cell::RefCell;
use std::collections::HashMap;
use std::rc::Rc;

#[derive(Copy, Clone, Debug)]
pub struct WasmMetrics {
    pub active_http_calls: Gauge,
    pub ratelimited_rq: Counter,
}

impl WasmMetrics {
    fn new() -> WasmMetrics {
        WasmMetrics {
            active_http_calls: Gauge::new(String::from("active_http_calls")),
            ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
        }
    }
}

#[derive(Debug)]
pub struct FilterCallContext {}

#[derive(Debug)]
pub struct LlmGatewayFilterContext {
    metrics: Rc<WasmMetrics>,
    // callouts stores a token_id-to-request mapping that we use during #on_http_call_response to match the response to the request.
    callouts: RefCell<HashMap<u32, FilterCallContext>>,
    llm_providers: Option<Rc<LlmProviders>>,
}

impl LlmGatewayFilterContext {
    pub fn new() -> LlmGatewayFilterContext {
        LlmGatewayFilterContext {
            callouts: RefCell::new(HashMap::new()),
            metrics: Rc::new(WasmMetrics::new()),
            llm_providers: None,
        }
    }
}

impl Client for LlmGatewayFilterContext {
    type CallContext = FilterCallContext;

    fn callouts(&self) -> &RefCell<HashMap<u32, Self::CallContext>> {
        &self.callouts
    }

    fn active_http_calls(&self) -> &Gauge {
        &self.metrics.active_http_calls
    }
}

impl Context for LlmGatewayFilterContext {}

// RootContext allows the Rust code to reach into the Envoy Config
impl RootContext for LlmGatewayFilterContext {
    fn on_configure(&mut self, _: usize) -> bool {
        let config_bytes = self
            .get_plugin_configuration()
            .expect("Arch config cannot be empty");

        let config: Configuration = match serde_yaml::from_slice(&config_bytes) {
            Ok(config) => config,
            Err(err) => panic!("Invalid arch config \"{:?}\"", err),
        };

        ratelimit::ratelimits(Some(config.ratelimits.unwrap_or_default()));

        match config.llm_providers.try_into() {
            Ok(llm_providers) => self.llm_providers = Some(Rc::new(llm_providers)),
            Err(err) => panic!("{err}"),
        }

        true
    }

    fn create_http_context(&self, context_id: u32) -> Option<Box<dyn HttpContext>> {
        debug!(
            "||| create_http_context called with context_id: {:?} |||",
            context_id
        );

        Some(Box::new(LlmGatewayStreamContext::new(
            context_id,
            Rc::clone(&self.metrics),
            Rc::clone(
                self.llm_providers
                    .as_ref()
                    .expect("LLM Providers must exist when Streams are being created"),
            ),
        )))
    }

    fn get_type(&self) -> Option<ContextType> {
        Some(ContextType::HttpContext)
    }
}
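on_configure above deserializes the Envoy plugin configuration straight into the shared Configuration struct with serde_yaml; only ratelimits and llm_providers matter to this filter. A rough sketch of parsing such a config the same way; the YAML field names and provider shape are assumptions for illustration, not taken from this diff:

// Hypothetical config; the stream context reads llm_provider name/model/access_key,
// so the YAML presumably carries fields along these lines.
const SAMPLE_CONFIG: &str = r#"
llm_providers:
  - name: openai
    model: gpt-4o-mini
    access_key: $OPENAI_API_KEY
ratelimits: []
"#;

fn main() {
    // In the filter this buffer comes from get_plugin_configuration().
    let config: serde_yaml::Value =
        serde_yaml::from_str(SAMPLE_CONFIG).expect("invalid arch config");
    println!("{:?}", config["llm_providers"]);
}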
crates/llm_gateway/src/llm_stream_context.rs (new file, 421 lines)
@ -0,0 +1,421 @@
use crate::llm_filter_context::WasmMetrics;
use common::common_types::open_ai::{
    ArchState, ChatCompletionChunkResponse, ChatCompletionsRequest, ChatCompletionsResponse,
    Message, ToolCall, ToolCallState,
};
use common::configuration::LlmProvider;
use common::consts::{
    ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, ARCH_STATE_HEADER, CHAT_COMPLETIONS_PATH,
    RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, USER_ROLE,
};
use common::llm_providers::LlmProviders;
use common::ratelimit::Header;
use common::{ratelimit, routing, tokenizer};
use http::StatusCode;
use log::debug;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;
use serde_json::Value;
use sha2::{Digest, Sha256};
use std::num::NonZero;
use std::rc::Rc;

use common::stats::IncrementingMetric;

#[derive(thiserror::Error, Debug)]
pub enum ServerError {
    #[error(transparent)]
    Deserialization(serde_json::Error),
    #[error("{0}")]
    LogicError(String),
    #[error(transparent)]
    ExceededRatelimit(ratelimit::Error),
    #[error("{why}")]
    BadRequest { why: String },
}

pub struct LlmGatewayStreamContext {
    context_id: u32,
    metrics: Rc<WasmMetrics>,
    tool_calls: Option<Vec<ToolCall>>,
    tool_call_response: Option<String>,
    arch_state: Option<Vec<ArchState>>,
    request_body_size: usize,
    ratelimit_selector: Option<Header>,
    streaming_response: bool,
    user_prompt: Option<Message>,
    response_tokens: usize,
    is_chat_completions_request: bool,
    chat_completions_request: Option<ChatCompletionsRequest>,
    llm_providers: Rc<LlmProviders>,
    llm_provider: Option<Rc<LlmProvider>>,
    request_id: Option<String>,
}

impl LlmGatewayStreamContext {
    #[allow(clippy::too_many_arguments)]
    pub fn new(context_id: u32, metrics: Rc<WasmMetrics>, llm_providers: Rc<LlmProviders>) -> Self {
        LlmGatewayStreamContext {
            context_id,
            metrics,
            chat_completions_request: None,
            tool_calls: None,
            tool_call_response: None,
            arch_state: None,
            request_body_size: 0,
            ratelimit_selector: None,
            streaming_response: false,
            user_prompt: None,
            response_tokens: 0,
            is_chat_completions_request: false,
            llm_providers,
            llm_provider: None,
            request_id: None,
        }
    }

    fn llm_provider(&self) -> &LlmProvider {
        self.llm_provider
            .as_ref()
            .expect("the provider should be set when asked for it")
    }

    fn select_llm_provider(&mut self) {
        let provider_hint = self
            .get_http_request_header(ARCH_PROVIDER_HINT_HEADER)
            .map(|provider_name| provider_name.into());

        debug!("llm provider hint: {:?}", provider_hint);
        self.llm_provider = Some(routing::get_llm_provider(
            &self.llm_providers,
            provider_hint,
        ));
        debug!("selected llm: {}", self.llm_provider.as_ref().unwrap().name);
    }

    fn modify_auth_headers(&mut self) -> Result<(), ServerError> {
        let llm_provider_api_key_value =
            self.llm_provider()
                .access_key
                .as_ref()
                .ok_or(ServerError::BadRequest {
                    why: format!(
                        "No access key configured for selected LLM Provider \"{}\"",
                        self.llm_provider()
                    ),
                })?;

        let authorization_header_value = format!("Bearer {}", llm_provider_api_key_value);

        self.set_http_request_header("Authorization", Some(&authorization_header_value));

        Ok(())
    }

    fn delete_content_length_header(&mut self) {
        // Remove the Content-Length header because further body manipulations in the gateway logic will invalidate it.
        // Servers generally throw away requests whose body length does not match the Content-Length header.
        // However, a missing Content-Length header is not grounds for bad requests given that intermediary hops could
        // manipulate the body in benign ways, e.g., compression.
        self.set_http_request_header("content-length", None);
    }

    fn save_ratelimit_header(&mut self) {
        self.ratelimit_selector = self
            .get_http_request_header(RATELIMIT_SELECTOR_HEADER_KEY)
            .and_then(|key| {
                self.get_http_request_header(&key)
                    .map(|value| Header { key, value })
            });
    }

    fn send_server_error(&self, error: ServerError, override_status_code: Option<StatusCode>) {
        debug!("server error occurred: {}", error);
        self.send_http_response(
            override_status_code
                .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR)
                .as_u16()
                .into(),
            vec![],
            Some(format!("{error}").as_bytes()),
        );
    }

    fn enforce_ratelimits(
        &mut self,
        model: &str,
        json_string: &str,
    ) -> Result<(), ratelimit::Error> {
        if let Some(selector) = self.ratelimit_selector.take() {
            // Tokenize and Ratelimit.
            if let Ok(token_count) = tokenizer::token_count(model, json_string) {
                ratelimit::ratelimits(None).read().unwrap().check_limit(
                    model.to_owned(),
                    selector,
                    NonZero::new(token_count as u32).unwrap(),
                )?;
            }
        }
        Ok(())
    }
}
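save_ratelimit_header above performs a two-step lookup: the selector header's value names a second request header, and that second header's key/value pair becomes the rate-limit key. The same indirection expressed over a plain map, with invented header names:

use std::collections::HashMap;

// Stand-in for common::ratelimit::Header.
#[derive(Debug)]
struct Header {
    key: String,
    value: String,
}

// Resolve the selector the way save_ratelimit_header does, but over a map.
fn resolve_selector(headers: &HashMap<&str, &str>, selector_header: &str) -> Option<Header> {
    let key = headers.get(selector_header)?; // e.g. "x-arch-ratelimit-selector: user-id"
    let value = headers.get(*key)?;          // e.g. "user-id: alice"
    Some(Header {
        key: key.to_string(),
        value: value.to_string(),
    })
}

fn main() {
    let headers = HashMap::from([
        ("x-arch-ratelimit-selector", "user-id"), // invented header names
        ("user-id", "alice"),
    ]);
    // Limits are then tracked per (key, value) pair, i.e. per user here.
    println!("{:?}", resolve_selector(&headers, "x-arch-ratelimit-selector"));
}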
// HttpContext is the trait that allows the Rust code to interact with HTTP objects.
impl HttpContext for LlmGatewayStreamContext {
    // Envoy's HTTP model is event driven. The WASM ABI has given implementors events to hook onto
    // the lifecycle of the http request and response.
    fn on_http_request_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
        self.select_llm_provider();
        self.add_http_request_header(ARCH_ROUTING_HEADER, &self.llm_provider().name);

        if let Err(error) = self.modify_auth_headers() {
            self.send_server_error(error, Some(StatusCode::BAD_REQUEST));
        }
        self.delete_content_length_header();
        self.save_ratelimit_header();

        self.is_chat_completions_request =
            self.get_http_request_header(":path").unwrap_or_default() == CHAT_COMPLETIONS_PATH;

        debug!(
            "on_http_request_headers S[{}] req_headers={:?}",
            self.context_id,
            self.get_http_request_headers()
        );

        self.request_id = self.get_http_request_header(REQUEST_ID_HEADER);

        Action::Continue
    }

    fn on_http_request_body(&mut self, body_size: usize, end_of_stream: bool) -> Action {
        // Let the client send the gateway all the data before sending to the LLM provider.
        // TODO: consider a streaming API.
        if !end_of_stream {
            return Action::Pause;
        }

        if body_size == 0 {
            return Action::Continue;
        }

        self.request_body_size = body_size;

        // Deserialize body into spec.
        // Currently OpenAI API.
        let mut deserialized_body: ChatCompletionsRequest =
            match self.get_http_request_body(0, body_size) {
                Some(body_bytes) => match serde_json::from_slice(&body_bytes) {
                    Ok(deserialized) => deserialized,
                    Err(e) => {
                        self.send_server_error(
                            ServerError::Deserialization(e),
                            Some(StatusCode::BAD_REQUEST),
                        );
                        return Action::Pause;
                    }
                },
                None => {
                    self.send_server_error(
                        ServerError::LogicError(format!(
                            "Failed to obtain body bytes even though body_size is {}",
                            body_size
                        )),
                        None,
                    );
                    return Action::Pause;
                }
            };
        self.is_chat_completions_request = true;

        // remove metadata from the request body
        deserialized_body.metadata = None;
        // delete model key from message array
        for message in deserialized_body.messages.iter_mut() {
            message.model = None;
        }

        // override model name from the llm provider
        deserialized_body
            .model
            .clone_from(&self.llm_provider.as_ref().unwrap().model);
        let chat_completion_request_str = serde_json::to_string(&deserialized_body).unwrap();

        // enforce ratelimits on ingress
        if let Err(e) =
            self.enforce_ratelimits(&deserialized_body.model, &chat_completion_request_str)
        {
            self.send_server_error(
                ServerError::ExceededRatelimit(e),
                Some(StatusCode::TOO_MANY_REQUESTS),
            );
            self.metrics.ratelimited_rq.increment(1);
            return Action::Continue;
        }

        debug!(
            "arch => {:?}, body: {}",
            deserialized_body.model, chat_completion_request_str
        );
        self.set_http_request_body(0, body_size, chat_completion_request_str.as_bytes());

        Action::Continue
    }
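Before proxying, on_http_request_body rewrites the client's JSON: it drops the request-level metadata, clears any per-message model field, and overwrites the top-level model with the selected provider's model before re-serializing. The same rewrite expressed over a raw serde_json::Value so each field manipulation is visible; the sample body and model names are invented:

use serde_json::{json, Value};

fn main() {
    // Invented client request body.
    let mut body: Value = json!({
        "model": "client-chosen-model",
        "metadata": { "trace": "abc" },
        "messages": [{ "role": "user", "content": "hi", "model": "stale" }]
    });

    // Drop request-level metadata, mirroring `deserialized_body.metadata = None`.
    body.as_object_mut().unwrap().remove("metadata");

    // Clear per-message model keys.
    if let Some(messages) = body["messages"].as_array_mut() {
        for message in messages {
            message.as_object_mut().unwrap().remove("model");
        }
    }

    // Override the model with the selected provider's model (name invented).
    body["model"] = json!("provider-configured-model");

    println!("{}", serde_json::to_string(&body).unwrap());
}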
    fn on_http_response_body(&mut self, body_size: usize, end_of_stream: bool) -> Action {
        debug!(
            "recv [S={}] bytes={} end_stream={}",
            self.context_id, body_size, end_of_stream
        );

        if !self.is_chat_completions_request {
            if let Some(body_str) = self
                .get_http_response_body(0, body_size)
                .and_then(|bytes| String::from_utf8(bytes).ok())
            {
                debug!("recv [S={}] body_str={}", self.context_id, body_str);
            }
            return Action::Continue;
        }

        if !end_of_stream {
            return Action::Pause;
        }

        let body = self
            .get_http_response_body(0, body_size)
            .expect("cant get response body");

        if self.streaming_response {
            let body_str = String::from_utf8(body).expect("body is not utf-8");
            debug!("streaming response");
            let chat_completions_data = match body_str.split_once("data: ") {
                Some((_, chat_completions_data)) => chat_completions_data,
                None => {
                    self.send_server_error(
                        ServerError::LogicError(String::from("parsing error in streaming data")),
                        None,
                    );
                    return Action::Pause;
                }
            };

            let chat_completions_chunk_response: ChatCompletionChunkResponse =
                match serde_json::from_str(chat_completions_data) {
                    Ok(de) => de,
                    Err(_) => {
                        if chat_completions_data != "[NONE]" {
                            self.send_server_error(
                                ServerError::LogicError(String::from(
                                    "error in streaming response",
                                )),
                                None,
                            );
                            return Action::Continue;
                        }
                        return Action::Continue;
                    }
                };

            if let Some(content) = chat_completions_chunk_response
                .choices
                .first()
                .unwrap()
                .delta
                .content
                .as_ref()
            {
                let model = &chat_completions_chunk_response.model;
                let token_count = tokenizer::token_count(model, content).unwrap_or(0);
                self.response_tokens += token_count;
            }
        } else {
            debug!("non streaming response");
            let chat_completions_response: ChatCompletionsResponse =
                match serde_json::from_slice(&body) {
                    Ok(de) => de,
                    Err(e) => {
                        debug!("invalid response: {}", String::from_utf8_lossy(&body));
                        self.send_server_error(ServerError::Deserialization(e), None);
                        return Action::Pause;
                    }
                };

            if chat_completions_response.usage.is_some() {
                self.response_tokens += chat_completions_response
                    .usage
                    .as_ref()
                    .unwrap()
                    .completion_tokens;
            }

            if let Some(tool_calls) = self.tool_calls.as_ref() {
                if !tool_calls.is_empty() {
                    if self.arch_state.is_none() {
                        self.arch_state = Some(Vec::new());
                    }

                    // compute sha hash from message history
                    let mut hasher = Sha256::new();
                    let prompts: Vec<String> = self
                        .chat_completions_request
                        .as_ref()
                        .unwrap()
                        .messages
                        .iter()
                        .filter(|msg| msg.role == USER_ROLE)
                        .map(|msg| msg.content.clone().unwrap())
                        .collect();
                    let prompts_merged = prompts.join("#.#");
                    hasher.update(prompts_merged.clone());
                    let hash_key = hasher.finalize();
                    // convert hash to hex string
                    let hash_key_str = format!("{:x}", hash_key);
                    debug!("hash key: {}, prompts: {}", hash_key_str, prompts_merged);

                    // create new tool call state
                    let tool_call_state = ToolCallState {
                        key: hash_key_str,
                        message: self.user_prompt.clone(),
                        tool_call: tool_calls[0].function.clone(),
                        tool_response: self.tool_call_response.clone().unwrap(),
                    };

                    // push tool call state to arch state
                    self.arch_state
                        .as_mut()
                        .unwrap()
                        .push(ArchState::ToolCall(vec![tool_call_state]));

                    let mut data: Value = serde_json::from_slice(&body).unwrap();
                    // use serde_json::Value to manipulate the json object and ensure that we don't lose any data
                    if let Value::Object(ref mut map) = data {
                        // serialize arch state and add to metadata
                        let arch_state_str = serde_json::to_string(&self.arch_state).unwrap();
                        debug!("arch_state: {}", arch_state_str);
                        let metadata = map
                            .entry("metadata")
                            .or_insert(Value::Object(serde_json::Map::new()));
                        metadata.as_object_mut().unwrap().insert(
                            ARCH_STATE_HEADER.to_string(),
                            serde_json::Value::String(arch_state_str),
                        );

                        let data_serialized = serde_json::to_string(&data).unwrap();
                        debug!("arch => user: {}", data_serialized);
                        self.set_http_response_body(0, body_size, data_serialized.as_bytes());
                    };
                }
            }
        }

        debug!(
            "recv [S={}] total_tokens={} end_stream={}",
            self.context_id, self.response_tokens, end_of_stream
        );

        // TODO: ratelimit based on response tokens.
        Action::Continue
    }
}

impl Context for LlmGatewayStreamContext {}
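The tool-call branch above derives its state key by joining the user-role prompts with the literal "#.#" separator and hashing the result with SHA-256, rendered as lowercase hex. The derivation in isolation, with sample prompts invented:

use sha2::{Digest, Sha256};

fn main() {
    // Sample message history; only user-role content feeds the key.
    let prompts = vec!["what's the weather in seattle", "and in portland?"];

    // Same derivation as the filter: join with "#.#", SHA-256, lowercase hex.
    let prompts_merged = prompts.join("#.#");
    let mut hasher = Sha256::new();
    hasher.update(prompts_merged.as_bytes());
    let hash_key_str = format!("{:x}", hasher.finalize());

    // The same history always maps to the same key, so stored tool-call
    // state can be matched back to a conversation on later requests.
    println!("key={hash_key_str}");
}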
File diff suppressed because it is too large