add jemalloc and /debug/memstats endpoint for OOM diagnosis (#885)
Some checks are pending
CI / pre-commit (push) Waiting to run
CI / plano-tools-tests (push) Waiting to run
CI / native-smoke-test (push) Waiting to run
CI / docker-build (push) Waiting to run
CI / validate-config (push) Waiting to run
CI / security-scan (push) Blocked by required conditions
CI / test-prompt-gateway (push) Blocked by required conditions
CI / test-model-alias-routing (push) Blocked by required conditions
CI / test-responses-api-with-state (push) Blocked by required conditions
CI / e2e-plano-tests (3.10) (push) Blocked by required conditions
CI / e2e-plano-tests (3.11) (push) Blocked by required conditions
CI / e2e-plano-tests (3.12) (push) Blocked by required conditions
CI / e2e-plano-tests (3.13) (push) Blocked by required conditions
CI / e2e-plano-tests (3.14) (push) Blocked by required conditions
CI / e2e-demo-preference (push) Blocked by required conditions
CI / e2e-demo-currency (push) Blocked by required conditions
Publish docker image (latest) / build-arm64 (push) Waiting to run
Publish docker image (latest) / build-amd64 (push) Waiting to run
Publish docker image (latest) / create-manifest (push) Blocked by required conditions
Build and Deploy Documentation / build (push) Waiting to run

This commit is contained in:
Adil Hafeez 2026-04-23 13:59:12 -07:00 committed by GitHub
parent c8079ac971
commit aa726b1bba
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 371 additions and 0 deletions

39
crates/Cargo.lock generated
View file

@ -366,6 +366,8 @@ dependencies = [
"serde_yaml", "serde_yaml",
"strsim", "strsim",
"thiserror 2.0.18", "thiserror 2.0.18",
"tikv-jemalloc-ctl",
"tikv-jemallocator",
"time", "time",
"tokio", "tokio",
"tokio-postgres", "tokio-postgres",
@ -2323,6 +2325,12 @@ dependencies = [
"windows-link", "windows-link",
] ]
[[package]]
name = "paste"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
[[package]] [[package]]
name = "percent-encoding" name = "percent-encoding"
version = "2.3.2" version = "2.3.2"
@ -3566,6 +3574,37 @@ dependencies = [
"rustc-hash 1.1.0", "rustc-hash 1.1.0",
] ]
[[package]]
name = "tikv-jemalloc-ctl"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "661f1f6a57b3a36dc9174a2c10f19513b4866816e13425d3e418b11cc37bc24c"
dependencies = [
"libc",
"paste",
"tikv-jemalloc-sys",
]
[[package]]
name = "tikv-jemalloc-sys"
version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "tikv-jemallocator"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a"
dependencies = [
"libc",
"tikv-jemalloc-sys",
]
[[package]] [[package]]
name = "time" name = "time"
version = "0.3.47" version = "0.3.47"

View file

@ -3,6 +3,10 @@ name = "brightstaff"
version = "0.1.0" version = "0.1.0"
edition = "2021" edition = "2021"
[features]
default = ["jemalloc"]
jemalloc = ["tikv-jemallocator", "tikv-jemalloc-ctl"]
[[bin]] [[bin]]
name = "brightstaff" name = "brightstaff"
path = "src/main.rs" path = "src/main.rs"
@ -47,6 +51,8 @@ serde_with = "3.13.0"
strsim = "0.11" strsim = "0.11"
serde_yaml = "0.9.34" serde_yaml = "0.9.34"
thiserror = "2.0.12" thiserror = "2.0.12"
tikv-jemallocator = { version = "0.6", optional = true }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"], optional = true }
tokio = { version = "1.44.2", features = ["full"] } tokio = { version = "1.44.2", features = ["full"] }
tokio-postgres = { version = "0.7", features = ["with-serde_json-1"] } tokio-postgres = { version = "0.7", features = ["with-serde_json-1"] }
tokio-stream = "0.1" tokio-stream = "0.1"

View file

@ -0,0 +1,53 @@
use bytes::Bytes;
use http_body_util::combinators::BoxBody;
use hyper::{Response, StatusCode};
use super::full;
// JSON payload returned by `/debug/memstats`.
#[derive(serde::Serialize)]
struct MemStats {
    // Bytes currently allocated by the application (jemalloc `stats.allocated`).
    allocated_bytes: usize,
    // Bytes in physically resident pages (jemalloc `stats.resident`).
    resident_bytes: usize,
    // Set when stats are unavailable (feature disabled or epoch advance failed);
    // omitted from the JSON on success.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
}
/// Returns jemalloc memory statistics as JSON (handler for `GET /debug/memstats`).
///
/// Falls back to a stub payload (zeros plus an `error` field) when the
/// jemalloc feature is disabled or the stats cannot be read; the endpoint
/// always answers 200 so it is safe to poll during OOM diagnosis.
pub async fn memstats() -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
    let stats = get_jemalloc_stats();
    // MemStats is two integers and an optional string, so serialization
    // cannot fail; a panic here would indicate a bug in MemStats itself.
    let json = serde_json::to_string(&stats).expect("MemStats serialization is infallible");
    Ok(Response::builder()
        .status(StatusCode::OK)
        .header("Content-Type", "application/json")
        .body(full(json))
        .expect("response built from static status/header parts is always valid"))
}
// Reads allocator counters via the jemalloc control interface.
#[cfg(feature = "jemalloc")]
fn get_jemalloc_stats() -> MemStats {
    use tikv_jemalloc_ctl::{epoch, stats};

    // jemalloc caches its statistics per epoch; advancing the epoch
    // refreshes the counters before they are read.
    match epoch::advance() {
        Ok(_) => MemStats {
            allocated_bytes: stats::allocated::read().unwrap_or(0),
            resident_bytes: stats::resident::read().unwrap_or(0),
            error: None,
        },
        Err(e) => MemStats {
            allocated_bytes: 0,
            resident_bytes: 0,
            error: Some(format!("failed to advance jemalloc epoch: {e}")),
        },
    }
}
// Stub used when the crate is built without the jemalloc feature:
// reports zeros and flags the condition in the `error` field.
#[cfg(not(feature = "jemalloc"))]
fn get_jemalloc_stats() -> MemStats {
    let error = Some(String::from("jemalloc feature not enabled"));
    MemStats {
        allocated_bytes: 0,
        resident_bytes: 0,
        error,
    }
}

View file

@ -1,4 +1,5 @@
pub mod agents; pub mod agents;
pub mod debug;
pub mod function_calling; pub mod function_calling;
pub mod llm; pub mod llm;
pub mod models; pub mod models;

View file

@ -1,5 +1,10 @@
#[cfg(feature = "jemalloc")]
#[global_allocator]
static ALLOC: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
use brightstaff::app_state::AppState; use brightstaff::app_state::AppState;
use brightstaff::handlers::agents::orchestrator::agent_chat; use brightstaff::handlers::agents::orchestrator::agent_chat;
use brightstaff::handlers::debug;
use brightstaff::handlers::empty; use brightstaff::handlers::empty;
use brightstaff::handlers::function_calling::function_calling_chat_handler; use brightstaff::handlers::function_calling::function_calling_chat_handler;
use brightstaff::handlers::llm::llm_chat; use brightstaff::handlers::llm::llm_chat;
@ -513,6 +518,7 @@ async fn dispatch(
Ok(list_models(Arc::clone(&state.llm_providers)).await) Ok(list_models(Arc::clone(&state.llm_providers)).await)
} }
(&Method::OPTIONS, "/v1/models" | "/agents/v1/models") => cors_preflight(), (&Method::OPTIONS, "/v1/models" | "/agents/v1/models") => cors_preflight(),
(&Method::GET, "/debug/memstats") => debug::memstats().await,
_ => { _ => {
debug!(method = %req.method(), path = %path, "no route found"); debug!(method = %req.method(), path = %path, "no route found");
let mut not_found = Response::new(empty()); let mut not_found = Response::new(empty());

View file

@ -3,3 +3,5 @@ pub mod model_metrics;
pub mod orchestrator; pub mod orchestrator;
pub mod orchestrator_model; pub mod orchestrator_model;
pub mod orchestrator_model_v1; pub mod orchestrator_model_v1;
#[cfg(test)]
mod stress_tests;

View file

@ -0,0 +1,264 @@
#[cfg(test)]
mod tests {
use crate::router::orchestrator::OrchestratorService;
use crate::session_cache::memory::MemorySessionCache;
use common::configuration::{SelectionPolicy, SelectionPreference, TopLevelRoutingPreference};
use hermesllm::apis::openai::{Message, MessageContent, Role};
use std::sync::Arc;
// Builds `n` alternating user/assistant messages with realistic-length text,
// used as synthetic chat history for the stress tests.
fn make_messages(n: usize) -> Vec<Message> {
    let mut messages = Vec::with_capacity(n);
    for i in 0..n {
        let role = if i % 2 == 0 { Role::User } else { Role::Assistant };
        let text = format!(
            "This is message number {i} with some padding text to make it realistic."
        );
        messages.push(Message {
            role,
            content: Some(MessageContent::Text(text)),
            name: None,
            tool_calls: None,
            tool_call_id: None,
        });
    }
    messages
}
// Builds the two routing preferences used by the stress tests
// (code generation and summarization), each with no selection preference.
fn make_routing_prefs() -> Vec<TopLevelRoutingPreference> {
    // Local constructor to avoid repeating the struct literal per preference.
    let pref = |name: &str, description: &str, models: &[&str]| TopLevelRoutingPreference {
        name: name.to_string(),
        description: description.to_string(),
        models: models.iter().map(|m| m.to_string()).collect(),
        selection_policy: SelectionPolicy {
            prefer: SelectionPreference::None,
        },
    };
    vec![
        pref(
            "code_generation",
            "Code generation and debugging tasks",
            &["openai/gpt-4o", "openai/gpt-4o-mini"],
        ),
        pref(
            "summarization",
            "Summarizing documents and text",
            &["anthropic/claude-3-sonnet", "openai/gpt-4o-mini"],
        ),
    ]
}
/// Stress test: exercise the full routing code path N times using a mock
/// HTTP server and measure jemalloc allocated bytes before/after.
///
/// This catches:
/// - Memory leaks in generate_request / parse_response
/// - Leaks in reqwest connection handling
/// - String accumulation in the orchestrator model
/// - Fragmentation (jemalloc allocated vs resident)
///
/// NOTE(review): without the `jemalloc` feature, `get_allocated()` returns 0,
/// so growth is 0 and the leak assertion is vacuous — confirm CI enables it.
#[tokio::test]
async fn stress_test_routing_determine_route() {
    // Mock upstream chat-completions endpoint so determine_route performs
    // real HTTP I/O without a live LLM backend.
    let mut server = mockito::Server::new_async().await;
    let router_url = format!("{}/v1/chat/completions", server.url());
    let mock_response = serde_json::json!({
        "id": "chatcmpl-mock",
        "object": "chat.completion",
        "created": 1234567890,
        "model": "plano-orchestrator",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "{\"route\": \"code_generation\"}"
            },
            "finish_reason": "stop"
        }],
        "usage": {"prompt_tokens": 100, "completion_tokens": 10, "total_tokens": 110}
    });
    let _mock = server
        .mock("POST", "/v1/chat/completions")
        .with_status(200)
        .with_header("content-type", "application/json")
        .with_body(mock_response.to_string())
        .expect_at_least(1)
        .create_async()
        .await;
    let prefs = make_routing_prefs();
    let session_cache = Arc::new(MemorySessionCache::new(1000));
    let orchestrator_service = Arc::new(OrchestratorService::with_routing(
        router_url,
        "Plano-Orchestrator".to_string(),
        "plano-orchestrator".to_string(),
        Some(prefs.clone()),
        None,
        None,
        session_cache,
        None,
        2048,
    ));
    // Warm up: a few requests to stabilize allocator state (connection pool,
    // lazily-initialized caches) before taking the baseline measurement.
    for _ in 0..10 {
        let msgs = make_messages(5);
        let _ = orchestrator_service
            .determine_route(&msgs, None, "warmup")
            .await;
    }
    // Snapshot memory after warmup
    let baseline = get_allocated();
    let num_iterations = 2000;
    for i in 0..num_iterations {
        // Vary message count and periodically pass inline preferences to
        // exercise both routing code paths.
        let msgs = make_messages(5 + (i % 10));
        let inline = if i % 3 == 0 {
            Some(make_routing_prefs())
        } else {
            None
        };
        let _ = orchestrator_service
            .determine_route(&msgs, inline, &format!("req-{i}"))
            .await;
    }
    let after = get_allocated();
    let growth = after.saturating_sub(baseline);
    let growth_mb = growth as f64 / (1024.0 * 1024.0);
    // num_iterations is a non-zero constant, so dividing directly is safe;
    // the former `if num_iterations > 0` guard was dead code.
    let per_request = growth / num_iterations;
    eprintln!("=== Routing Stress Test Results ===");
    eprintln!(" Iterations: {num_iterations}");
    eprintln!(" Baseline alloc: {} bytes", baseline);
    eprintln!(" Final alloc: {} bytes", after);
    eprintln!(" Growth: {} bytes ({growth_mb:.2} MB)", growth);
    eprintln!(" Per-request: {} bytes", per_request);
    // Allow up to 256 bytes per request of retained growth (connection pool, etc.)
    // A true leak would show thousands of bytes per request.
    assert!(
        per_request < 256,
        "Possible memory leak: {per_request} bytes/request retained after {num_iterations} iterations"
    );
}
/// Stress test with high concurrency: many parallel determine_route calls.
#[tokio::test]
async fn stress_test_routing_concurrent() {
    // Mock upstream chat-completions endpoint shared by all spawned tasks.
    let mut server = mockito::Server::new_async().await;
    let router_url = format!("{}/v1/chat/completions", server.url());
    let mock_response = serde_json::json!({
        "id": "chatcmpl-mock",
        "object": "chat.completion",
        "created": 1234567890,
        "model": "plano-orchestrator",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "{\"route\": \"summarization\"}"
            },
            "finish_reason": "stop"
        }],
        "usage": {"prompt_tokens": 100, "completion_tokens": 10, "total_tokens": 110}
    });
    let _mock = server
        .mock("POST", "/v1/chat/completions")
        .with_status(200)
        .with_header("content-type", "application/json")
        .with_body(mock_response.to_string())
        .expect_at_least(1)
        .create_async()
        .await;
    let prefs = make_routing_prefs();
    let session_cache = Arc::new(MemorySessionCache::new(1000));
    let orchestrator_service = Arc::new(OrchestratorService::with_routing(
        router_url,
        "Plano-Orchestrator".to_string(),
        "plano-orchestrator".to_string(),
        Some(prefs),
        None,
        None,
        session_cache,
        None,
        2048,
    ));
    // Warm up so connection pools / lazy state don't count toward growth.
    for _ in 0..20 {
        let msgs = make_messages(3);
        let _ = orchestrator_service
            .determine_route(&msgs, None, "warmup")
            .await;
    }
    // Baseline snapshot: only growth after this point is attributed to load.
    let baseline = get_allocated();
    let concurrency = 50;
    let requests_per_task = 100;
    let total = concurrency * requests_per_task;
    let mut handles = vec![];
    for t in 0..concurrency {
        let svc = Arc::clone(&orchestrator_service);
        let handle = tokio::spawn(async move {
            for r in 0..requests_per_task {
                // Vary message counts slightly so payload sizes differ.
                let msgs = make_messages(3 + (r % 8));
                let _ = svc
                    .determine_route(&msgs, None, &format!("req-{t}-{r}"))
                    .await;
            }
        });
        handles.push(handle);
    }
    // Join every task; a panic inside a task surfaces here via unwrap.
    for h in handles {
        h.await.unwrap();
    }
    let after = get_allocated();
    let growth = after.saturating_sub(baseline);
    // `total` is a non-zero constant (50 * 100), so this division is safe.
    let per_request = growth / total;
    eprintln!("=== Concurrent Routing Stress Test Results ===");
    eprintln!(" Tasks: {concurrency} x {requests_per_task} = {total}");
    eprintln!(" Baseline: {} bytes", baseline);
    eprintln!(" Final: {} bytes", after);
    eprintln!(
        " Growth: {} bytes ({:.2} MB)",
        growth,
        growth as f64 / 1_048_576.0
    );
    eprintln!(" Per-request: {} bytes", per_request);
    // NOTE(review): without the jemalloc feature get_allocated() returns 0,
    // making growth 0 and this assertion vacuous — confirm CI enables it.
    assert!(
        per_request < 512,
        "Possible memory leak under concurrency: {per_request} bytes/request retained after {total} requests"
    );
}
// Reads jemalloc's `stats.allocated` counter for the measurement snapshots.
#[cfg(feature = "jemalloc")]
fn get_allocated() -> usize {
    // jemalloc caches statistics per epoch; advance to refresh them.
    tikv_jemalloc_ctl::epoch::advance().expect("failed to advance jemalloc epoch");
    // Fail loudly instead of silently returning 0: a 0 reading would make
    // the leak assertions in the tests above pass vacuously.
    tikv_jemalloc_ctl::stats::allocated::read().expect("failed to read jemalloc allocated stat")
}
// Fallback when jemalloc is disabled: no allocator stats are available, so
// report 0 — the leak assertions become vacuous in this configuration.
#[cfg(not(feature = "jemalloc"))]
fn get_allocated() -> usize {
    0
}
}