fix: harden LLM providers, UTF-8 handling, and webhook/batch reliability

- webclaw-llm: add explicit request + connect timeouts to the reqwest client in every provider (anthropic, openai, ollama), with a shorter timeout on the ollama health check, so a stalled provider fails fast.
- webclaw-llm: fix a panic when truncating a provider error body that contains multibyte characters at the 500-byte cut; the truncation is now a char-safe take of the first 500 characters.
- webclaw-core: snap the endpoint-scan budget cut to a UTF-8 char boundary so oversized scripts with non-ASCII content no longer panic.
- webclaw-core: rewrite js_literal_to_json to copy raw bytes instead of `byte as char`, preserving multibyte UTF-8 in SvelteKit string values rather than producing Latin-1 mojibake.
- webclaw-cli: have fire_webhook return its JoinHandle and await it at the crawl/batch/batch-llm call sites, removing the fixed 500ms sleeps.
- webclaw-mcp: drop the up-front DNS pre-validation loop in batch that aborted the whole request on one bad URL; the fetch layer already applies the same SSRF guard per URL and reports per-URL errors.
- webclaw-fetch: include the port in the warmup homepage URL so hosts on a non-default port are warmed correctly.

Adds regression tests for the UTF-8 endpoint-scan and SvelteKit cases.
2026-06-30 03:49:37 +02:00 · 2026-06-09 21:10:15 +02:00 · 2026-06-09 21:10:15 +02:00 · 499345046c
commit 499345046c
parent d0d7b835f2
9 changed files with 117 additions and 51 deletions
--- a/crates/webclaw-llm/src/providers/anthropic.rs
+++ b/crates/webclaw-llm/src/providers/anthropic.rs
@@ -1,6 +1,8 @@
 /// Anthropic provider — Claude models via api.anthropic.com.
 /// Anthropic's API differs from OpenAI: system message is a top-level param,
 /// not part of the messages array.
+use std::time::Duration;
+
 use async_trait::async_trait;
 use serde_json::json;

@@ -35,7 +37,11 @@ impl AnthropicProvider {
        let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?;

        Some(Self {
-            client: reqwest::Client::new(),
+            client: reqwest::Client::builder()
+                .timeout(Duration::from_secs(120))
+                .connect_timeout(Duration::from_secs(10))
+                .build()
+                .unwrap_or_else(|_| reqwest::Client::new()),
            key,
            base_url: base_url
                .or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok())
@@ -108,11 +114,7 @@ impl LlmProvider for AnthropicProvider {
        if !resp.status().is_success() {
            let status = resp.status();
            let text = resp.text().await.unwrap_or_default();
-            let safe_text = if text.len() > 500 {
-                &text[..500]
-            } else {
-                &text
-            };
+            let safe_text = text.chars().take(500).collect::<String>();
            return Err(LlmError::ProviderError(format!(
                "anthropic returned {status}: {safe_text}"
            )));
--- a/crates/webclaw-llm/src/providers/ollama.rs
+++ b/crates/webclaw-llm/src/providers/ollama.rs
@@ -1,5 +1,7 @@
 /// Ollama provider — talks to a local Ollama instance (default localhost:11434).
 /// First choice in the provider chain: free, private, fast on Apple Silicon.
+use std::time::Duration;
+
 use async_trait::async_trait;
 use serde_json::json;

@@ -24,7 +26,11 @@ impl OllamaProvider {
            .unwrap_or_else(|| "qwen3:8b".into());

        Self {
-            client: reqwest::Client::new(),
+            client: reqwest::Client::builder()
+                .timeout(Duration::from_secs(120))
+                .connect_timeout(Duration::from_secs(10))
+                .build()
+                .unwrap_or_else(|_| reqwest::Client::new()),
            base_url,
            default_model,
        }
@@ -70,11 +76,7 @@ impl LlmProvider for OllamaProvider {
        if !resp.status().is_success() {
            let status = resp.status();
            let text = resp.text().await.unwrap_or_default();
-            let safe_text = if text.len() > 500 {
-                &text[..500]
-            } else {
-                &text
-            };
+            let safe_text = text.chars().take(500).collect::<String>();
            return Err(LlmError::ProviderError(format!(
                "ollama returned {status}: {safe_text}"
            )));
@@ -98,7 +100,8 @@ impl LlmProvider for OllamaProvider {

    async fn is_available(&self) -> bool {
        let url = format!("{}/api/tags", self.base_url);
-        matches!(self.client.get(&url).send().await, Ok(r) if r.status().is_success())
+        let req = self.client.get(&url).timeout(Duration::from_secs(10));
+        matches!(req.send().await, Ok(r) if r.status().is_success())
    }

    fn name(&self) -> &str {
--- a/crates/webclaw-llm/src/providers/openai.rs
+++ b/crates/webclaw-llm/src/providers/openai.rs
@@ -1,4 +1,6 @@
 /// OpenAI provider — works with api.openai.com and any OpenAI-compatible endpoint.
+use std::time::Duration;
+
 use async_trait::async_trait;
 use serde_json::json;

@@ -69,7 +71,11 @@ impl OpenAiProvider {
        let key = load_api_key(key_override, "OPENAI_API_KEY")?;

        Some(Self {
-            client: reqwest::Client::new(),
+            client: reqwest::Client::builder()
+                .timeout(Duration::from_secs(120))
+                .connect_timeout(Duration::from_secs(10))
+                .build()
+                .unwrap_or_else(|_| reqwest::Client::new()),
            key,
            base_url: base_url
                .or_else(|| std::env::var("OPENAI_BASE_URL").ok())
@@ -132,11 +138,7 @@ impl LlmProvider for OpenAiProvider {
        if !resp.status().is_success() {
            let status = resp.status();
            let text = resp.text().await.unwrap_or_default();
-            let safe_text = if text.len() > 500 {
-                &text[..500]
-            } else {
-                &text
-            };
+            let safe_text = text.chars().take(500).collect::<String>();
            return Err(LlmError::ProviderError(format!(
                "openai returned {status}: {safe_text}"
            )));