fix: harden LLM providers, UTF-8 handling, and webhook/batch reliability

- webclaw-llm: add explicit request + connect timeouts to the reqwest client in every provider (anthropic, openai, ollama) with a shorter timeout on the ollama health check, so a stalled provider fails fast.
- webclaw-llm: fix a panic when truncating a provider error body that contains multibyte characters near the 500-char cut (char-safe take).
- webclaw-core: snap the endpoint-scan budget cut to a UTF-8 char boundary so oversized scripts with non-ASCII content no longer panic.
- webclaw-core: rewrite js_literal_to_json to copy raw bytes instead of `byte as char`, preserving multibyte UTF-8 in SvelteKit string values rather than producing Latin-1 mojibake.
- webclaw-cli: have fire_webhook return its JoinHandle and await it at the crawl/batch/batch-llm call sites, removing the fixed 500ms sleeps.
- webclaw-mcp: drop the up-front DNS pre-validation loop in batch that aborted the whole request on one bad URL; the fetch layer already applies the same SSRF guard per URL and reports per-URL errors.
- webclaw-fetch: include the port in the warmup homepage URL so hosts on a non-default port are warmed correctly.

Adds regression tests for the UTF-8 endpoint-scan and SvelteKit cases.
2026-06-11 22:55:13 +02:00 · 2026-06-09 21:10:15 +02:00 · 2026-06-09 21:10:15 +02:00 · 499345046c
commit 499345046c
parent d0d7b835f2
9 changed files with 117 additions and 51 deletions
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -1548,7 +1548,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
    // Fire webhook on crawl complete
    if let Some(ref webhook_url) = cli.webhook {
        let urls: Vec<&str> = result.pages.iter().map(|p| p.url.as_str()).collect();
-        fire_webhook(
+        let handle = fire_webhook(
            webhook_url,
            &serde_json::json!({
                "event": "crawl_complete",
@ -1559,8 +1559,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
                "urls": urls,
            }),
        );
-        // Brief pause so the async webhook has time to fire
-        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+        // Wait for the webhook to finish so the process doesn't exit mid-send.
+        let _ = handle.await;
    }

    if result.errors > 0 {
@ -1658,7 +1658,7 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
    // Fire webhook on batch complete
    if let Some(ref webhook_url) = cli.webhook {
        let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect();
-        fire_webhook(
+        let handle = fire_webhook(
            webhook_url,
            &serde_json::json!({
                "event": "batch_complete",
@ -1668,7 +1668,7 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
                "urls": urls,
            }),
        );
-        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+        let _ = handle.await;
    }

    if errors > 0 {
@ -1742,9 +1742,12 @@ async fn spawn_on_change(cmd: &str, stdin_payload: &[u8]) {
    }
 }

-/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr.
-/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly.
-fn fire_webhook(url: &str, payload: &serde_json::Value) {
+/// Fire a webhook POST with a JSON payload. Spawns the send on a background task
+/// and returns its `JoinHandle` so callers that need delivery (e.g. one-shot
+/// crawl/batch runs that exit immediately after) can `.await` it; long-running
+/// loops can drop the handle and let it run fire-and-forget. Errors are logged
+/// to stderr. Auto-detects Discord and Slack webhook URLs and wraps the payload.
+fn fire_webhook(url: &str, payload: &serde_json::Value) -> tokio::task::JoinHandle<()> {
    let url = url.to_string();
    let is_discord = url.contains("discord.com/api/webhooks");
    let is_slack = url.contains("hooks.slack.com");
@ -1806,7 +1809,7 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
            },
            Err(e) => eprintln!("[webhook] client error: {e}"),
        }
-    });
+    })
 }

 async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
@ -2318,7 +2321,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
    eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");

    if let Some(ref webhook_url) = cli.webhook {
-        fire_webhook(
+        let handle = fire_webhook(
            webhook_url,
            &serde_json::json!({
                "event": "batch_llm_complete",
@ -2327,7 +2330,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
                "errors": errors,
            }),
        );
-        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+        let _ = handle.await;
    }

    if errors > 0 {