fix: harden LLM providers, UTF-8 handling, and webhook/batch reliability

- webclaw-llm: add explicit request + connect timeouts to the reqwest
  client in every provider (anthropic, openai, ollama) with a shorter
  timeout on the ollama health check, so a stalled provider fails fast.
- webclaw-llm: fix a panic when truncating a provider error body that
  contains multibyte characters near the 500-char cut (char-safe take).
- webclaw-core: snap the endpoint-scan budget cut to a UTF-8 char
  boundary so oversized scripts with non-ASCII content no longer panic.
- webclaw-core: rewrite js_literal_to_json to copy raw bytes instead of
  `byte as char`, preserving multibyte UTF-8 in SvelteKit string values
  rather than producing Latin-1 mojibake.
- webclaw-cli: have fire_webhook return its JoinHandle and await it at
  the crawl/batch/batch-llm call sites, removing the fixed 500ms sleeps.
- webclaw-mcp: drop the up-front DNS pre-validation loop in batch that
  aborted the whole request on one bad URL; the fetch layer already
  applies the same SSRF guard per URL and reports per-URL errors.
- webclaw-fetch: include the port in the warmup homepage URL so hosts
  on a non-default port are warmed correctly.

Adds regression tests for the UTF-8 endpoint-scan and SvelteKit cases.
This commit is contained in:
Valerio 2026-06-09 21:10:15 +02:00
parent d0d7b835f2
commit 499345046c
9 changed files with 117 additions and 51 deletions

View file

@ -1548,7 +1548,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
// Fire webhook on crawl complete
if let Some(ref webhook_url) = cli.webhook {
let urls: Vec<&str> = result.pages.iter().map(|p| p.url.as_str()).collect();
fire_webhook(
let handle = fire_webhook(
webhook_url,
&serde_json::json!({
"event": "crawl_complete",
@ -1559,8 +1559,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
"urls": urls,
}),
);
// Brief pause so the async webhook has time to fire
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
// Wait for the webhook to finish so the process doesn't exit mid-send.
let _ = handle.await;
}
if result.errors > 0 {
@ -1658,7 +1658,7 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
// Fire webhook on batch complete
if let Some(ref webhook_url) = cli.webhook {
let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect();
fire_webhook(
let handle = fire_webhook(
webhook_url,
&serde_json::json!({
"event": "batch_complete",
@ -1668,7 +1668,7 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
"urls": urls,
}),
);
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
let _ = handle.await;
}
if errors > 0 {
@ -1742,9 +1742,12 @@ async fn spawn_on_change(cmd: &str, stdin_payload: &[u8]) {
}
}
/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr.
/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly.
fn fire_webhook(url: &str, payload: &serde_json::Value) {
/// Fire a webhook POST with a JSON payload. Spawns the send on a background task
/// and returns its `JoinHandle`. Callers that must not exit before the send
/// attempt has finished (e.g. one-shot crawl/batch runs that return immediately
/// after) can `.await` the handle; long-running loops can drop it and let the
/// task run fire-and-forget. Awaiting does not guarantee delivery: send errors
/// are only logged to stderr. Auto-detects Discord and Slack webhook URLs and
/// wraps the payload accordingly.
fn fire_webhook(url: &str, payload: &serde_json::Value) -> tokio::task::JoinHandle<()> {
let url = url.to_string();
let is_discord = url.contains("discord.com/api/webhooks");
let is_slack = url.contains("hooks.slack.com");
@ -1806,7 +1809,7 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
},
Err(e) => eprintln!("[webhook] client error: {e}"),
}
});
})
}
async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
@ -2318,7 +2321,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");
if let Some(ref webhook_url) = cli.webhook {
fire_webhook(
let handle = fire_webhook(
webhook_url,
&serde_json::json!({
"event": "batch_llm_complete",
@ -2327,7 +2330,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
"errors": errors,
}),
);
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
let _ = handle.await;
}
if errors > 0 {