Add support for streaming and fix a few issues (see description) (#202)

2026-06-26 15:39:40 +02:00 · 2024-10-28 20:05:06 -04:00 · 2024-10-28 20:05:06 -04:00 · 662a840ac5
commit 662a840ac5
parent 29ff8da60f
45 changed files with 2266 additions and 477 deletions
--- a/crates/common/src/common_types.rs
+++ b/crates/common/src/common_types.rs
@ -34,11 +34,16 @@ pub struct SearchPointResult {
 }

 pub mod open_ai {
-    use std::collections::HashMap;
+    use std::{
+        collections::{HashMap, VecDeque},
+        fmt::Display,
+    };

    use serde::{ser::SerializeMap, Deserialize, Serialize};
    use serde_yaml::Value;

+    use crate::consts::{ARCH_FC_MODEL_NAME, ASSISTANT_ROLE};
+
    #[derive(Debug, Clone, Serialize, Deserialize)]
    pub struct ChatCompletionsRequest {
        #[serde(default)]
@ -182,12 +187,16 @@ pub mod open_ai {
    #[derive(Debug, Clone, Serialize, Deserialize)]
    pub struct Message {
        pub role: String,
+
        #[serde(skip_serializing_if = "Option::is_none")]
        pub content: Option<String>,
+
        #[serde(skip_serializing_if = "Option::is_none")]
        pub model: Option<String>,
+
        #[serde(skip_serializing_if = "Option::is_none")]
        pub tool_calls: Option<Vec<ToolCall>>,
+
        #[serde(skip_serializing_if = "Option::is_none")]
        pub tool_call_id: Option<String>,
    }
@ -235,17 +244,116 @@ pub mod open_ai {
        pub metadata: Option<HashMap<String, String>>,
    }

+    impl ChatCompletionsResponse {
+        /// Builds a response holding exactly one choice (index `0`).
+        ///
+        /// The choice's message uses `ASSISTANT_ROLE` as its role, `message` as
+        /// its content and `ARCH_FC_MODEL_NAME` as its model; it carries no tool
+        /// calls and no tool call id. The choice's `finish_reason` is `"done"`.
+        /// The response-level `model` is also `ARCH_FC_MODEL_NAME`, while
+        /// `usage` and `metadata` are left as `None`.
+        pub fn new(message: String) -> Self {
+            let assistant_message = Message {
+                role: ASSISTANT_ROLE.to_owned(),
+                content: Some(message),
+                model: Some(ARCH_FC_MODEL_NAME.to_owned()),
+                tool_calls: None,
+                tool_call_id: None,
+            };
+
+            let only_choice = Choice {
+                message: assistant_message,
+                index: 0,
+                finish_reason: String::from("done"),
+            };
+
+            Self {
+                choices: vec![only_choice],
+                usage: None,
+                model: ARCH_FC_MODEL_NAME.to_owned(),
+                metadata: None,
+            }
+        }
+    }
+
    #[derive(Debug, Clone, Serialize, Deserialize)]
    pub struct Usage {
        pub completion_tokens: usize,
    }

    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct ChatCompletionChunkResponse {
-        pub model: String,
+    pub struct ChatCompletionStreamResponse {
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub model: Option<String>,
        pub choices: Vec<ChunkChoice>,
    }

+    impl ChatCompletionStreamResponse {
+        /// Builds a stream response holding a single chunk choice.
+        ///
+        /// * `response` - becomes the delta's `content`.
+        /// * `role` - becomes the delta's `role`.
+        /// * `model` - becomes the response-level `model`; the delta's own
+        ///   `model` is always `None`.
+        /// * `tool_calls` - becomes the delta's `tool_calls`.
+        ///
+        /// The delta's `tool_call_id` and the choice's `finish_reason` are
+        /// always `None`.
+        pub fn new(
+            response: Option<String>,
+            role: Option<String>,
+            model: Option<String>,
+            tool_calls: Option<Vec<ToolCall>>,
+        ) -> Self {
+            let delta = Delta {
+                role,
+                content: response,
+                tool_calls,
+                model: None,
+                tool_call_id: None,
+            };
+
+            let only_choice = ChunkChoice {
+                delta,
+                finish_reason: None,
+            };
+
+            Self {
+                model,
+                choices: vec![only_choice],
+            }
+        }
+    }
+
+    /// Errors related to handling a streamed chat-completion response.
+    #[derive(Debug, thiserror::Error)]
+    pub enum ChatCompletionChunkResponseError {
+        /// A payload could not be deserialized; wraps the underlying
+        /// `serde_json::Error` (`#[from]` lets `?` convert it automatically).
+        #[error("failed to deserialize")]
+        Deserialization(#[from] serde_json::Error),
+        /// A data chunk had empty content.
+        /// NOTE(review): not constructed anywhere in the visible code — confirm
+        /// where (or whether) this variant is raised.
+        #[error("empty content in data chunk")]
+        EmptyContent,
+        /// No chunks were present.
+        /// NOTE(review): not constructed anywhere in the visible code — confirm
+        /// where (or whether) this variant is raised.
+        #[error("no chunks present")]
+        NoChunks,
+    }
+
+    /// An ordered list of stream-response chunks.
+    ///
+    /// Can be built from raw event text via `TryFrom<&str>`; its `Display`
+    /// output is the concatenation of the chunks' first-choice delta contents.
+    #[derive(Debug, Clone)]
+    pub struct ChatCompletionStreamResponseServerEvents {
+        /// The parsed chunks, in the order they were stored.
+        pub events: Vec<ChatCompletionStreamResponse>,
+    }
+
+    impl Display for ChatCompletionStreamResponseServerEvents {
+        /// Writes the concatenation of `delta.content` taken from the first
+        /// choice of every event, in order.
+        ///
+        /// Events without choices, and first choices whose content is `None`,
+        /// contribute nothing. Content is streamed straight into the formatter,
+        /// so no strings are cloned and no intermediate buffer is built.
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            self.events
+                .iter()
+                // `first()` replaces the manual `is_empty()` check + `choices[0]`.
+                .filter_map(|event| event.choices.first())
+                .filter_map(|choice| choice.delta.content.as_deref())
+                .try_for_each(|content| f.write_str(content))
+        }
+    }
+
+    impl TryFrom<&str> for ChatCompletionStreamResponseServerEvents {
+        type Error = ChatCompletionChunkResponseError;
+
+        /// Parses event text into a list of stream-response chunks.
+        ///
+        /// Only lines starting with `"data: "` are considered; every other line
+        /// (including blank separators) is ignored. A payload equal to
+        /// `"[DONE]"` is skipped. Each remaining payload is deserialized as a
+        /// `ChatCompletionStreamResponse`.
+        ///
+        /// # Errors
+        ///
+        /// Returns `ChatCompletionChunkResponseError::Deserialization` for the
+        /// first payload that is not valid JSON for a chunk.
+        fn try_from(value: &str) -> Result<Self, Self::Error> {
+            // NOTE(review): collecting straight into a `Vec` leaves the module's
+            // `VecDeque` import unused; it can be dropped from the `use` list.
+            let events = value
+                .lines()
+                // `strip_prefix` both tests for and removes the marker, so no
+                // hard-coded offset or `unwrap` is needed.
+                .filter_map(|line| line.strip_prefix("data: "))
+                .filter(|payload| *payload != "[DONE]")
+                .map(serde_json::from_str::<ChatCompletionStreamResponse>)
+                .collect::<Result<Vec<_>, _>>()?;
+
+            Ok(ChatCompletionStreamResponseServerEvents { events })
+        }
+    }
+
    #[derive(Debug, Clone, Serialize, Deserialize)]
    pub struct ChunkChoice {
        pub delta: Delta,
@ -255,7 +363,30 @@ pub mod open_ai {

    #[derive(Debug, Clone, Serialize, Deserialize)]
    pub struct Delta {
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub role: Option<String>,
+
+        #[serde(skip_serializing_if = "Option::is_none")]
        pub content: Option<String>,
+
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub tool_calls: Option<Vec<ToolCall>>,
+
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub model: Option<String>,
+
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub tool_call_id: Option<String>,
+    }
+
+    /// Serializes each chunk to JSON and renders it as `data: <json>` followed
+    /// by a blank line, concatenating the results in order.
+    ///
+    /// Returns an empty string when `chunks` is empty.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a chunk fails to serialize to JSON.
+    pub fn to_server_events(chunks: Vec<ChatCompletionStreamResponse>) -> String {
+        chunks
+            .iter()
+            .map(|chunk| format!("data: {}\n\n", serde_json::to_string(chunk).unwrap()))
+            .collect()
    }
 }

@ -313,7 +444,7 @@ pub struct PromptGuardResponse {

 #[cfg(test)]
 mod test {
-    use crate::common_types::open_ai::Message;
+    use crate::common_types::open_ai::{ChatCompletionStreamResponseServerEvents, Message};
    use pretty_assertions::{assert_eq, assert_ne};
    use std::collections::HashMap;

@ -448,4 +579,173 @@ mod test {
            ParameterType::String
        );
    }
+
+    /// Parses five `data:` chunks (no `[DONE]` terminator) and checks both the
+    /// per-chunk delta contents and the concatenated `Display` output.
+    #[test]
+    fn stream_chunk_parse() {
+        const CHUNK_RESPONSE: &str = r#"data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"Hello"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"!"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" How"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" can"},"logprobs":null,"finish_reason":null}]}
+
+
+"#;
+
+        let server_events =
+            ChatCompletionStreamResponseServerEvents::try_from(CHUNK_RESPONSE).unwrap();
+
+        // Comparing the whole list checks both the chunk count and each content.
+        let contents: Vec<Option<&str>> = server_events
+            .events
+            .iter()
+            .map(|event| event.choices[0].delta.content.as_deref())
+            .collect();
+        assert_eq!(
+            contents,
+            vec![Some(""), Some("Hello"), Some("!"), Some(" How"), Some(" can")]
+        );
+
+        assert_eq!(server_events.to_string(), "Hello! How can");
+    }
+
+    /// Parses a stream that ends with an empty-delta `stop` chunk followed by
+    /// `data: [DONE]`; the `[DONE]` marker must not produce an event.
+    #[test]
+    fn stream_chunk_parse_done() {
+        const CHUNK_RESPONSE: &str = r#"data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" I"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" assist"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" you"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" today"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"?"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}]}
+
+data: [DONE]
+"#;
+
+        let server_events: ChatCompletionStreamResponseServerEvents =
+            ChatCompletionStreamResponseServerEvents::try_from(CHUNK_RESPONSE).unwrap();
+
+        // Six events: five content chunks plus the final empty-delta chunk.
+        let contents: Vec<Option<&str>> = server_events
+            .events
+            .iter()
+            .map(|event| event.choices[0].delta.content.as_deref())
+            .collect();
+        assert_eq!(
+            contents,
+            vec![
+                Some(" I"),
+                Some(" assist"),
+                Some(" you"),
+                Some(" today"),
+                Some("?"),
+                None
+            ]
+        );
+
+        assert_eq!(server_events.to_string(), " I assist you today?");
+    }
+
+    /// Parses a Mistral-style stream (no `system_fingerprint`/`logprobs`, with a
+    /// trailing `usage` object) and checks the event count and joined text.
+    #[test]
+    fn stream_chunk_parse_mistral() {
+        const CHUNK_RESPONSE: &str = r#"data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":"Hello"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":"!"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" How"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" can"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" I"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" assist"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" you"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" today"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":"?"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":""},"finish_reason":"stop"}],"usage":{"prompt_tokens":4,"total_tokens":13,"completion_tokens":9}}
+
+data: [DONE]
+"#;
+
+        let server_events: ChatCompletionStreamResponseServerEvents =
+            ChatCompletionStreamResponseServerEvents::try_from(CHUNK_RESPONSE).unwrap();
+        assert_eq!(server_events.events.len(), 11);
+
+        assert_eq!(
+            server_events.to_string(),
+            "Hello! How can I assist you today?"
+        );
+    }
 }
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -27,12 +27,12 @@ pub enum GatewayMode {
 pub struct Configuration {
    pub version: String,
    pub listener: Listener,
-    pub endpoints: HashMap<String, Endpoint>,
+    pub endpoints: Option<HashMap<String, Endpoint>>,
    pub llm_providers: Vec<LlmProvider>,
    pub overrides: Option<Overrides>,
    pub system_prompt: Option<String>,
    pub prompt_guards: Option<PromptGuards>,
-    pub prompt_targets: Vec<PromptTarget>,
+    pub prompt_targets: Option<Vec<PromptTarget>>,
    pub error_target: Option<ErrorTargetDetail>,
    pub ratelimits: Option<Vec<Ratelimit>>,
    pub tracing: Option<Tracing>,
@ -246,8 +246,10 @@ mod test {
        );

        let prompt_targets = &config.prompt_targets;
-        assert_eq!(prompt_targets.len(), 2);
+        assert_eq!(prompt_targets.as_ref().unwrap().len(), 2);
        let prompt_target = prompt_targets
+            .as_ref()
+            .unwrap()
            .iter()
            .find(|p| p.name == "reboot_network_device")
            .unwrap();
@ -255,6 +257,8 @@ mod test {
        assert_eq!(prompt_target.default, None);

        let prompt_target = prompt_targets
+            .as_ref()
+            .unwrap()
            .iter()
            .find(|p| p.name == "information_extraction")
            .unwrap();
--- a/crates/common/src/consts.rs
+++ b/crates/common/src/consts.rs
@ -18,6 +18,7 @@ pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider";
 pub const MESSAGES_KEY: &str = "messages";
 pub const ARCH_PROVIDER_HINT_HEADER: &str = "x-arch-llm-provider-hint";
 pub const CHAT_COMPLETIONS_PATH: &str = "/v1/chat/completions";
+pub const HEALTHZ_PATH: &str = "/healthz";
 pub const ARCH_STATE_HEADER: &str = "x-arch-state";
 pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function-1.5B";
 pub const REQUEST_ID_HEADER: &str = "x-request-id";
@ -25,4 +26,5 @@ pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal";
 pub const ARCH_UPSTREAM_HOST_HEADER: &str = "x-arch-upstream";
 pub const ARCH_LLM_UPSTREAM_LISTENER: &str = "arch_llm_listener";
 pub const ARCH_MODEL_PREFIX: &str = "Arch";
-pub const HALLUCINATION_TEMPLATE: &str = "It seems I’m missing some information. Could you provide the following details ";
+pub const HALLUCINATION_TEMPLATE: &str =
+    "It seems I'm missing some information. Could you provide the following details ";
--- a/crates/common/src/errors.rs
+++ b/crates/common/src/errors.rs
@ -1,6 +1,7 @@
 use proxy_wasm::types::Status;
+use serde_json::error;

-use crate::ratelimit;
+use crate::{common_types::open_ai::ChatCompletionChunkResponseError, ratelimit};

 #[derive(thiserror::Error, Debug)]
 pub enum ClientError {
@ -37,4 +38,6 @@ pub enum ServerError {
    ExceededRatelimit(ratelimit::Error),
    #[error("{why}")]
    BadRequest { why: String },
+    #[error("error in streaming response")]
+    Streaming(#[from] ChatCompletionChunkResponseError),
 }
--- a/crates/common/src/tokenizer.rs
+++ b/crates/common/src/tokenizer.rs
@ -1,17 +1,19 @@
 use log::debug;

-#[derive(Debug, PartialEq, Eq)]
+#[derive(thiserror::Error, Debug, PartialEq, Eq)]
 #[allow(dead_code)]
 pub enum Error {
-    UnknownModel,
-    FailedToTokenize,
+    #[error("Unknown model: {model_name}")]
+    UnknownModel { model_name: String },
 }

 #[allow(dead_code)]
 pub fn token_count(model_name: &str, text: &str) -> Result<usize, Error> {
    debug!("getting token count model={}", model_name);
    // Consideration: is it more expensive to instantiate the BPE object every time, or to contend the singleton?
-    let bpe = tiktoken_rs::get_bpe_from_model(model_name).map_err(|_| Error::UnknownModel)?;
+    let bpe = tiktoken_rs::get_bpe_from_model(model_name).map_err(|_| Error::UnknownModel {
+        model_name: model_name.to_string(),
+    })?;
    Ok(bpe.encode_ordinary(text).len())
 }

@ -32,7 +34,9 @@ mod test {
    #[test]
    fn unrecognized_model() {
        assert_eq!(
-            Error::UnknownModel,
+            Error::UnknownModel {
+                model_name: "unknown".to_string()
+            },
            token_count("unknown", "").expect_err("unknown model")
        )
    }