removing model_server python module to brightstaff (function calling) (#615)

* adding function_calling functionality via rust * fixed rendered YAML file * removed model_server from envoy.template and forwarding traffic to bright_staff * fixed bugs in function_calling.rs that were breaking tests. All good now * updating e2e test to clean up disk usage * removing Arch* models to be used as a default model if one is not specified * if the user sets arch-function base_url we should honor it * fixing demos as we needed to pin to a particular version of huggingface_hub else the chatbot ui wouldn't build * adding a constant for Arch-Function model name * fixing some edge cases with calls made to Arch-Function * fixed JSON parsing issues in function_calling.rs * fixed bug where the raw response from Arch-Function was re-encoded * removed debug from supervisord.conf * commenting out disk cleanup * adding back disk space --------- Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-288.local> Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-342.local>
2026-05-08 15:22:43 +02:00 · 2025-11-22 12:55:00 -08:00 · 2025-11-22 12:55:00 -08:00 · 88c2bd1851
commit 88c2bd1851
parent 126b029345
40 changed files with 2517 additions and 1356 deletions
--- a/crates/brightstaff/src/handlers/function_calling.rs
+++ b/crates/brightstaff/src/handlers/function_calling.rs
--- a/crates/brightstaff/src/handlers/mod.rs
+++ b/crates/brightstaff/src/handlers/mod.rs
@ -1,9 +1,11 @@
 pub mod agent_chat_completions;
 pub mod agent_selector;
-pub mod chat_completions;
+pub mod router;
 pub mod models;
+pub mod function_calling;
 pub mod pipeline_processor;
 pub mod response_handler;
+pub mod utils;

 #[cfg(test)]
 mod integration_tests;
--- a/crates/brightstaff/src/handlers/chat_completions.rs
+++ b/crates/brightstaff/src/handlers/chat_completions.rs
@ -6,18 +6,15 @@ use hermesllm::clients::endpoints::SupportedUpstreamAPIs;
 use hermesllm::clients::SupportedAPIs;
 use hermesllm::{ProviderRequest, ProviderRequestType};
 use http_body_util::combinators::BoxBody;
-use http_body_util::{BodyExt, Full, StreamBody};
-use hyper::body::Frame;
+use http_body_util::{BodyExt, Full};
 use hyper::header::{self};
 use hyper::{Request, Response, StatusCode};
 use std::collections::HashMap;
 use std::sync::Arc;
-use tokio::sync::mpsc;
-use tokio_stream::wrappers::ReceiverStream;
-use tokio_stream::StreamExt;
 use tracing::{debug, info, warn};

 use crate::router::llm_router::RouterService;
+use crate::handlers::utils::{create_streaming_response, PassthroughProcessor};

 fn full<T: Into<Bytes>>(chunk: T) -> BoxBody<Bytes, hyper::Error> {
    Full::new(chunk.into())
@ -25,7 +22,7 @@ fn full<T: Into<Bytes>>(chunk: T) -> BoxBody<Bytes, hyper::Error> {
        .boxed()
 }

-pub async fn chat(
+pub async fn router_chat(
    request: Request<hyper::body::Incoming>,
    router_service: Arc<RouterService>,
    full_qualified_llm_provider_url: String,
@ -237,34 +234,12 @@ pub async fn chat(
        headers.insert(header_name, header_value.clone());
    }

-    // channel to create async stream
-    let (tx, rx) = mpsc::channel::<Bytes>(16);
+    // Use the streaming utility with a passthrough processor (no modification of chunks)
+    let byte_stream = llm_response.bytes_stream();
+    let processor = PassthroughProcessor;
+    let streaming_response = create_streaming_response(byte_stream, processor, 16);

-    // Spawn a task to send data as it becomes available
-    tokio::spawn(async move {
-        let mut byte_stream = llm_response.bytes_stream();
-
-        while let Some(item) = byte_stream.next().await {
-            let item = match item {
-                Ok(item) => item,
-                Err(err) => {
-                    warn!("Error receiving chunk: {:?}", err);
-                    break;
-                }
-            };
-
-            if tx.send(item).await.is_err() {
-                warn!("Receiver dropped");
-                break;
-            }
-        }
-    });
-
-    let stream = ReceiverStream::new(rx).map(|chunk| Ok::<_, hyper::Error>(Frame::data(chunk)));
-
-    let stream_body = BoxBody::new(StreamBody::new(stream));
-
-    match response.body(stream_body) {
+    match response.body(streaming_response.body) {
        Ok(response) => Ok(response),
        Err(err) => {
            let err_msg = format!("Failed to create response: {}", err);
--- a/crates/brightstaff/src/handlers/utils.rs
+++ b/crates/brightstaff/src/handlers/utils.rs
@ -0,0 +1,93 @@
+use bytes::Bytes;
+use http_body_util::combinators::BoxBody;
+use http_body_util::StreamBody;
+use hyper::body::Frame;
+use tokio::sync::mpsc;
+use tokio_stream::wrappers::ReceiverStream;
+use tokio_stream::StreamExt;
+use tracing::warn;
+
+/// Per-chunk hook that `create_streaming_response` runs inside its spawned task (hence `Send + 'static`).
+/// Implementors can inject custom logic during streaming (e.g., hallucination detection, logging)
+pub trait StreamProcessor: Send + 'static {
+    /// Handle one chunk: `Ok(Some(b))` forwards `b` downstream, `Ok(None)` drops the chunk, `Err(msg)` stops the stream.
+    fn process_chunk(&mut self, chunk: Bytes) -> Result<Option<Bytes>, String>;
+
+    /// Completion hook, intended for a stream that finished successfully; the default implementation does nothing.
+    fn on_complete(&mut self) {}
+
+    /// Error hook; receives the message of an upstream read failure or of a `process_chunk` error. Default: no-op.
+    fn on_error(&mut self, _error: &str) {}
+}
+
+/// A no-op processor: every chunk is forwarded unchanged; it never skips a chunk and never returns an error.
+pub struct PassthroughProcessor;
+
+impl StreamProcessor for PassthroughProcessor {
+    fn process_chunk(&mut self, chunk: Bytes) -> Result<Option<Bytes>, String> {
+        Ok(Some(chunk))
+    }
+}
+
+/// Value returned by `create_streaming_response`: the HTTP body plus a handle to the task that feeds it.
+pub struct StreamingResponse {
+    pub body: BoxBody<Bytes, hyper::Error>, // yields the chunks forwarded by the spawned task
+    pub processor_handle: tokio::task::JoinHandle<()>, // join handle of the spawned forwarding task
+}
+
+/// Builds a streaming HTTP body from an upstream byte stream, passing every chunk through `processor`.
+///
+/// A background task is spawned that pulls items from `byte_stream`, hands each chunk to
+/// `processor.process_chunk`, and pushes whatever the processor returns into a bounded channel
+/// that backs the returned body.
+///
+/// # Arguments
+/// * `byte_stream` - upstream chunks (or `reqwest` errors) to forward.
+/// * `processor` - per-chunk hook; it is moved into the spawned task.
+/// * `buffer_size` - capacity of the channel between the task and the body. A value of `0`
+///   is raised to `1`, because `tokio::sync::mpsc::channel` panics on a zero capacity.
+///
+/// # Returns
+/// A `StreamingResponse` holding the body and the `JoinHandle` of the spawned task.
+///
+/// # Termination
+/// * Upstream exhausted: `on_complete` is called.
+/// * Upstream read error or `process_chunk` error: `on_error` is called with the message;
+///   `on_complete` is NOT called afterwards.
+/// * Receiver (the body) dropped: the task stops after logging; neither hook is called.
+///
+/// # Panics
+/// Uses `tokio::spawn`, so it must be called from within a Tokio runtime.
+pub fn create_streaming_response<S, P>(
+    mut byte_stream: S,
+    mut processor: P,
+    buffer_size: usize,
+) -> StreamingResponse
+where
+    S: StreamExt<Item = Result<Bytes, reqwest::Error>> + Send + Unpin + 'static,
+    P: StreamProcessor,
+{
+    // A zero-capacity bounded channel makes tokio panic; clamp to the smallest valid size.
+    let (tx, rx) = mpsc::channel::<Bytes>(buffer_size.max(1));
+
+    // Spawn a task to process and forward chunks
+    let processor_handle = tokio::spawn(async move {
+        // The loop evaluates to `true` only when the upstream stream was fully drained.
+        let drained = loop {
+            let chunk = match byte_stream.next().await {
+                None => break true,
+                Some(Ok(chunk)) => chunk,
+                Some(Err(err)) => {
+                    let err_msg = format!("Error receiving chunk: {:?}", err);
+                    warn!("{}", err_msg);
+                    processor.on_error(&err_msg);
+                    break false;
+                }
+            };
+
+            // Process the chunk
+            match processor.process_chunk(chunk) {
+                Ok(Some(processed_chunk)) => {
+                    if tx.send(processed_chunk).await.is_err() {
+                        // The body was dropped (e.g. client went away); nothing left to feed.
+                        warn!("Receiver dropped");
+                        break false;
+                    }
+                }
+                Ok(None) => {
+                    // Skip this chunk
+                }
+                Err(err) => {
+                    warn!("Processor error: {}", err);
+                    processor.on_error(&err);
+                    break false;
+                }
+            }
+        };
+
+        // Only report completion for a stream that actually ran to its end; an aborted
+        // stream has already been reported through `on_error` (or was abandoned by the reader).
+        if drained {
+            processor.on_complete();
+        }
+    });
+
+    // Convert channel receiver to HTTP stream
+    let stream = ReceiverStream::new(rx).map(|chunk| Ok::<_, hyper::Error>(Frame::data(chunk)));
+    let stream_body = BoxBody::new(StreamBody::new(stream));
+
+    StreamingResponse {
+        body: stream_body,
+        processor_handle,
+    }
+}