mirror of
https://github.com/katanemo/plano.git
synced 2026-06-17 15:25:17 +02:00
Merge branch 'main' into adil/signoz_tracing
This commit is contained in:
commit
4a635cc6e4
7 changed files with 287 additions and 42 deletions
76
api_llm_gateway.rest
Normal file
76
api_llm_gateway.rest
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
@llm_endpoint = http://localhost:12000
|
||||
@openai_endpoint = https://api.openai.com
|
||||
@access_key = {{$dotenv OPENAI_API_KEY}}
|
||||
|
||||
### openai request
|
||||
POST {{openai_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
Authorization: Bearer {{access_key}}
|
||||
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hello"
|
||||
}
|
||||
],
|
||||
"model": "gpt-4o-mini"
|
||||
}
|
||||
|
||||
### openai request (streaming)
|
||||
POST {{openai_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
Authorization: Bearer {{access_key}}
|
||||
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hello"
|
||||
}
|
||||
],
|
||||
"model": "gpt-4o-mini",
|
||||
"stream": true
|
||||
}
|
||||
|
||||
|
||||
### llm gateway request
|
||||
POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hello"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
### llm gateway request (streaming)
|
||||
POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hello"
|
||||
}
|
||||
],
|
||||
"stream": true
|
||||
}
|
||||
|
||||
### llm gateway request (provider hint)
|
||||
POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
x-arch-llm-provider-hint: gpt-3.5-turbo-0125
|
||||
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hello"
|
||||
}
|
||||
]
|
||||
}
|
||||
44
api_model_server.rest
Normal file
44
api_model_server.rest
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
@model_server_endpoint = http://localhost:51000
|
||||
@archfc_endpoint = https://api.fc.archgw.com
|
||||
|
||||
### talk to model_server for completion
|
||||
POST {{model_server_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "how is the weather in seattle for next 10 days"
|
||||
}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"id": "weather-112",
|
||||
"tool_type": "function",
|
||||
"function": {
|
||||
"name": "weather_forecast",
|
||||
"arguments": {"city": "str", "days": "int"}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
### talk to arch_fc directly for completion
|
||||
POST {{archfc_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"model": "Arch-Function",
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant.\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"id\": \"weather-112\", \"tool_type\": \"function\", \"function\": {\"name\": \"weather_forecast\", \"arguments\": {\"city\": \"str\", \"days\": \"int\"}}}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>"
|
||||
},
|
||||
{ "role": "user", "content": "how is the weather in seattle?" },
|
||||
{ "role": "assistant", "content": "Of course! " }
|
||||
],
|
||||
"continue_final_message": true,
|
||||
"add_generation_prompt": false
|
||||
}
|
||||
87
api_prompt_gateway.rest
Normal file
87
api_prompt_gateway.rest
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
@prompt_endpoint = http://localhost:10000
|
||||
|
||||
### prompt gateway request
|
||||
POST {{prompt_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "how is the weather in seattle for next 10 days"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
### prompt gateway request (streaming)
|
||||
POST {{prompt_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "how is the weather in seattle for next 10 days"
|
||||
}
|
||||
],
|
||||
"stream": true
|
||||
}
|
||||
|
||||
|
||||
### prompt gateway request param gathering
|
||||
POST {{prompt_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "how is the weather in seattle"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
### prompt gateway request param gathering and function calling
|
||||
POST {{prompt_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "how is the weather in seattle"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "It seems I'm missing some information. Could you provide the following details days ?",
|
||||
"model": "Arch-Function-1.5b"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "for next 10 days"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
### prompt gateway request param gathering and function calling (streaming)
|
||||
POST {{prompt_endpoint}}/v1/chat/completions HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "how is the weather in seattle"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "It seems I'm missing some information. Could you provide the following details days ?",
|
||||
"model": "Arch-Function-1.5b"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "for next 10 days"
|
||||
}
|
||||
],
|
||||
"stream": true
|
||||
}
|
||||
|
|
@ -11,7 +11,8 @@ use common::http::CallArgs;
|
|||
use common::http::Client;
|
||||
use common::stats::Gauge;
|
||||
use common::stats::IncrementingMetric;
|
||||
use log::debug;
|
||||
use http::StatusCode;
|
||||
use log::{debug, info, trace, warn};
|
||||
use proxy_wasm::traits::*;
|
||||
use proxy_wasm::types::*;
|
||||
use std::cell::RefCell;
|
||||
|
|
@ -53,6 +54,7 @@ pub struct FilterContext {
|
|||
prompt_guards: Rc<PromptGuards>,
|
||||
embeddings_store: Option<Rc<EmbeddingsStore>>,
|
||||
temp_embeddings_store: EmbeddingsStore,
|
||||
active_embedding_calls_count: u32,
|
||||
}
|
||||
|
||||
impl FilterContext {
|
||||
|
|
@ -66,22 +68,26 @@ impl FilterContext {
|
|||
prompt_guards: Rc::new(PromptGuards::default()),
|
||||
embeddings_store: Some(Rc::new(HashMap::new())),
|
||||
temp_embeddings_store: HashMap::new(),
|
||||
active_embedding_calls_count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn process_prompt_targets(&self) {
|
||||
for values in self.prompt_targets.iter() {
|
||||
let prompt_target = values.1;
|
||||
self.schedule_embeddings_call(
|
||||
&prompt_target.name,
|
||||
&prompt_target.description,
|
||||
EmbeddingType::Description,
|
||||
);
|
||||
}
|
||||
fn process_prompt_targets(&mut self) {
|
||||
let prompt_target_description: Vec<(String, String)> = self
|
||||
.prompt_targets
|
||||
.iter()
|
||||
.map(|(k, v)| (k.clone(), v.description.clone()))
|
||||
.collect();
|
||||
|
||||
prompt_target_description
|
||||
.iter()
|
||||
.for_each(|(name, description)| {
|
||||
self.schedule_embeddings_call(name, description, EmbeddingType::Description);
|
||||
});
|
||||
}
|
||||
|
||||
fn schedule_embeddings_call(
|
||||
&self,
|
||||
&mut self,
|
||||
prompt_target_name: &str,
|
||||
input: &str,
|
||||
embedding_type: EmbeddingType,
|
||||
|
|
@ -116,6 +122,7 @@ impl FilterContext {
|
|||
embedding_type,
|
||||
};
|
||||
|
||||
self.active_embedding_calls_count += 1;
|
||||
if let Err(error) = self.http_call(call_args, call_context) {
|
||||
panic!("{error}")
|
||||
}
|
||||
|
|
@ -123,9 +130,9 @@ impl FilterContext {
|
|||
|
||||
fn embedding_response_handler(
|
||||
&mut self,
|
||||
body_size: usize,
|
||||
embedding_type: EmbeddingType,
|
||||
prompt_target_name: String,
|
||||
body: Vec<u8>,
|
||||
) {
|
||||
let prompt_target = self
|
||||
.prompt_targets
|
||||
|
|
@ -137,9 +144,6 @@ impl FilterContext {
|
|||
)
|
||||
});
|
||||
|
||||
let body = self
|
||||
.get_http_call_response_body(0, body_size)
|
||||
.expect("No body in response");
|
||||
if !body.is_empty() {
|
||||
let mut embedding_response: CreateEmbeddingResponse =
|
||||
match serde_json::from_slice(&body) {
|
||||
|
|
@ -208,7 +212,7 @@ impl Context for FilterContext {
|
|||
body_size: usize,
|
||||
_num_trailers: usize,
|
||||
) {
|
||||
debug!(
|
||||
trace!(
|
||||
"filter_context: on_http_call_response called with token_id: {:?}",
|
||||
token_id
|
||||
);
|
||||
|
|
@ -218,13 +222,26 @@ impl Context for FilterContext {
|
|||
.remove(&token_id)
|
||||
.expect("invalid token_id");
|
||||
|
||||
self.active_embedding_calls_count -= 1;
|
||||
self.metrics.active_http_calls.increment(-1);
|
||||
let body_bytes = self.get_http_call_response_body(0, body_size).unwrap();
|
||||
|
||||
self.embedding_response_handler(
|
||||
body_size,
|
||||
callout_data.embedding_type,
|
||||
callout_data.prompt_target_name,
|
||||
)
|
||||
if let Some(status_code) = self.get_http_call_response_header(":status") {
|
||||
if status_code == StatusCode::OK.as_str() {
|
||||
self.embedding_response_handler(
|
||||
callout_data.embedding_type,
|
||||
callout_data.prompt_target_name,
|
||||
body_bytes,
|
||||
);
|
||||
} else {
|
||||
warn!(
|
||||
"Received non-200 status code: {} for callout with token_id: {}: body_str: {}",
|
||||
status_code,
|
||||
token_id,
|
||||
String::from_utf8(body_bytes).unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -262,10 +279,7 @@ impl RootContext for FilterContext {
|
|||
context_id
|
||||
);
|
||||
|
||||
let embedding_store = match self.embeddings_store.as_ref() {
|
||||
None => return None,
|
||||
Some(store) => Some(Rc::clone(store)),
|
||||
};
|
||||
let embedding_store = self.embeddings_store.as_ref().map(Rc::clone);
|
||||
Some(Box::new(StreamContext::new(
|
||||
context_id,
|
||||
Rc::clone(&self.metrics),
|
||||
|
|
@ -287,8 +301,20 @@ impl RootContext for FilterContext {
|
|||
}
|
||||
|
||||
fn on_tick(&mut self) {
|
||||
debug!("starting up arch filter in mode: prompt gateway mode");
|
||||
self.process_prompt_targets();
|
||||
self.set_tick_period(Duration::from_secs(0));
|
||||
if self.embeddings_store.is_some()
|
||||
&& self.embeddings_store.as_ref().unwrap().len() == self.prompt_targets.len()
|
||||
{
|
||||
info!("embeddings store initialized");
|
||||
self.set_tick_period(Duration::from_secs(0));
|
||||
} else {
|
||||
if self.active_embedding_calls_count == 0 {
|
||||
info!("retrieving embeddings from embedding server");
|
||||
self.process_prompt_targets();
|
||||
} else {
|
||||
info!("waiting for embeddings store to be initialized");
|
||||
}
|
||||
|
||||
self.set_tick_period(Duration::from_secs(5));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -37,10 +37,10 @@ impl HttpContext for StreamContext {
|
|||
|
||||
let request_path = self.get_http_request_header(":path").unwrap_or_default();
|
||||
if request_path == HEALTHZ_PATH {
|
||||
if self.embeddings_store.is_none() {
|
||||
self.send_http_response(503, vec![], None);
|
||||
} else {
|
||||
if self.is_embedding_store_initialized() {
|
||||
self.send_http_response(200, vec![], None);
|
||||
} else {
|
||||
self.send_http_response(503, vec![], None);
|
||||
}
|
||||
return Action::Continue;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -61,7 +61,7 @@ pub struct StreamCallContext {
|
|||
|
||||
pub struct StreamContext {
|
||||
system_prompt: Rc<Option<String>>,
|
||||
prompt_targets: Rc<HashMap<String, PromptTarget>>,
|
||||
pub prompt_targets: Rc<HashMap<String, PromptTarget>>,
|
||||
pub embeddings_store: Option<Rc<EmbeddingsStore>>,
|
||||
overrides: Rc<Option<Overrides>>,
|
||||
pub metrics: Rc<WasmMetrics>,
|
||||
|
|
@ -111,10 +111,21 @@ impl StreamContext {
|
|||
traceparent: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn embeddings_store(&self) -> &EmbeddingsStore {
|
||||
self.embeddings_store
|
||||
.as_ref()
|
||||
.expect("embeddings store is not set")
|
||||
self.embeddings_store.as_ref().unwrap()
|
||||
}
|
||||
|
||||
pub fn is_embedding_store_initialized(&self) -> bool {
|
||||
if self.embeddings_store.as_ref().is_none() {
|
||||
return false;
|
||||
}
|
||||
|
||||
if self.embeddings_store.as_ref().unwrap().len() == self.prompt_targets.len() {
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
pub fn send_server_error(&self, error: ServerError, override_status_code: Option<StatusCode>) {
|
||||
|
|
@ -232,7 +243,7 @@ impl StreamContext {
|
|||
"embeddings not found for prompt target name: {}",
|
||||
prompt_name
|
||||
);
|
||||
return (prompt_name.clone(), f64::NAN);
|
||||
return (prompt_name.clone(), 0.0);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -243,7 +254,7 @@ impl StreamContext {
|
|||
"description embeddings not found for prompt target name: {}",
|
||||
prompt_name
|
||||
);
|
||||
return (prompt_name.clone(), f64::NAN);
|
||||
return (prompt_name.clone(), 0.0);
|
||||
}
|
||||
};
|
||||
let similarity_score_description =
|
||||
|
|
@ -698,7 +709,7 @@ impl StreamContext {
|
|||
if self.tool_calls.is_none() || self.tool_calls.as_ref().unwrap().is_empty() {
|
||||
// This means that Arch FC did not have enough information to resolve the function call
|
||||
// Arch FC probably responded with a message asking for more information.
|
||||
// Let's send the response back to the user to initalize lightweight dialog for parameter collection
|
||||
// Let's send the response back to the user to initialize lightweight dialog for parameter collection
|
||||
|
||||
//TODO: add resolver name to the response so the client can send the response back to the correct resolver
|
||||
|
||||
|
|
|
|||
|
|
@ -161,6 +161,7 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
|
|||
.expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
|
||||
.returning(Some(&embeddings_response_buffer))
|
||||
.expect_log(Some(LogLevel::Trace), None)
|
||||
.expect_log(Some(LogLevel::Warn), None)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Trace), None)
|
||||
.expect_http_call(
|
||||
|
|
@ -244,7 +245,7 @@ fn setup_filter(module: &mut Tester, config: &str) -> i32 {
|
|||
|
||||
module
|
||||
.call_proxy_on_tick(filter_context)
|
||||
.expect_log(Some(LogLevel::Debug), None)
|
||||
.expect_log(Some(LogLevel::Info), None)
|
||||
.expect_log(Some(LogLevel::Trace), None)
|
||||
.expect_http_call(
|
||||
Some("arch_internal"),
|
||||
|
|
@ -262,7 +263,7 @@ fn setup_filter(module: &mut Tester, config: &str) -> i32 {
|
|||
)
|
||||
.returning(Some(101))
|
||||
.expect_metric_increment("active_http_calls", 1)
|
||||
.expect_set_tick_period_millis(Some(0))
|
||||
.expect_set_tick_period_millis(Some(5000))
|
||||
.execute_and_expect(ReturnType::None)
|
||||
.unwrap();
|
||||
|
||||
|
|
@ -289,7 +290,7 @@ fn setup_filter(module: &mut Tester, config: &str) -> i32 {
|
|||
0,
|
||||
)
|
||||
.expect_log(
|
||||
Some(LogLevel::Debug),
|
||||
Some(LogLevel::Trace),
|
||||
Some(
|
||||
format!(
|
||||
"filter_context: on_http_call_response called with token_id: {:?}",
|
||||
|
|
@ -332,7 +333,7 @@ llm_providers:
|
|||
|
||||
overrides:
|
||||
# confidence threshold for prompt target intent matching
|
||||
prompt_target_intent_matching_threshold: 0.6
|
||||
prompt_target_intent_matching_threshold: 0.0
|
||||
|
||||
system_prompt: |
|
||||
You are a helpful assistant.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue