diff --git a/CHANGELOG.md b/CHANGELOG.md index 97aac8b..d7e51d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,18 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [Unreleased] + +### Fixed +- Pages with multibyte text (accented or CJK characters) no longer panic or get mangled during extraction. API-endpoint discovery now cuts oversized scripts on a character boundary instead of crashing mid-character, and structured-data parsing preserves non-ASCII string values instead of turning them into mojibake. +- LLM error messages from a provider no longer panic when the error body contains multibyte characters near the truncation point. +- LLM provider requests now have explicit connect and overall timeouts, so a stalled or unreachable provider fails fast instead of hanging. +- Batch extraction in the MCP server no longer aborts the whole batch when a single URL fails to resolve; bad URLs are reported as individual per-URL errors and the rest still run. +- CLI crawl and batch runs now wait for the completion webhook to actually send before exiting, replacing a fixed delay that could cut the request off or waste time. +- Homepage warm-up requests now include the port for hosts on a non-default port, so those sites are warmed correctly. 
+ +--- + ## [0.6.7] — 2026-06-09 ### Changed diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 37a04ff..7d82f73 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -1548,7 +1548,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> { // Fire webhook on crawl complete if let Some(ref webhook_url) = cli.webhook { let urls: Vec<&str> = result.pages.iter().map(|p| p.url.as_str()).collect(); - fire_webhook( + let handle = fire_webhook( webhook_url, &serde_json::json!({ "event": "crawl_complete", @@ -1559,8 +1559,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> { "urls": urls, }), ); - // Brief pause so the async webhook has time to fire - tokio::time::sleep(std::time::Duration::from_millis(500)).await; + // Wait for the webhook to finish so the process doesn't exit mid-send. + let _ = handle.await; } if result.errors > 0 { @@ -1658,7 +1658,7 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option)]) -> Result<() // Fire webhook on batch complete if let Some(ref webhook_url) = cli.webhook { let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect(); - fire_webhook( + let handle = fire_webhook( webhook_url, &serde_json::json!({ "event": "batch_complete", @@ -1668,7 +1668,7 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option)]) -> Result<() "urls": urls, }), ); - tokio::time::sleep(std::time::Duration::from_millis(500)).await; + let _ = handle.await; } if errors > 0 { @@ -1742,9 +1742,12 @@ async fn spawn_on_change(cmd: &str, stdin_payload: &[u8]) { } } -/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr. -/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly. -fn fire_webhook(url: &str, payload: &serde_json::Value) { +/// Fire a webhook POST with a JSON payload. Spawns the send on a background task +/// and returns its `JoinHandle` so callers that need delivery (e.g. 
one-shot +/// crawl/batch runs that exit immediately after) can `.await` it; long-running +/// loops can drop the handle and let it run fire-and-forget. Errors are logged +/// to stderr. Auto-detects Discord and Slack webhook URLs and wraps the payload. +fn fire_webhook(url: &str, payload: &serde_json::Value) -> tokio::task::JoinHandle<()> { let url = url.to_string(); let is_discord = url.contains("discord.com/api/webhooks"); let is_slack = url.contains("hooks.slack.com"); @@ -1806,7 +1809,7 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) { }, Err(e) => eprintln!("[webhook] client error: {e}"), } - }); + }) } async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> { @@ -2318,7 +2321,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option)]) -> Resul eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)"); if let Some(ref webhook_url) = cli.webhook { - fire_webhook( + let handle = fire_webhook( webhook_url, &serde_json::json!({ "event": "batch_llm_complete", @@ -2327,7 +2330,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option)]) -> Resul "errors": errors, }), ); - tokio::time::sleep(std::time::Duration::from_millis(500)).await; + let _ = handle.await; } if errors > 0 { diff --git a/crates/webclaw-core/src/endpoints.rs b/crates/webclaw-core/src/endpoints.rs index 21c5280..d87755c 100644 --- a/crates/webclaw-core/src/endpoints.rs +++ b/crates/webclaw-core/src/endpoints.rs @@ -233,7 +233,13 @@ pub fn extract_endpoints( } let slice = if text.len() > *budget { *truncated = true; - &text[..*budget] + // Snap the cut to a UTF-8 char boundary so non-ASCII content + // (multibyte codepoints straddling the budget) can't panic. 
+ let mut cut = (*budget).min(text.len()); + while cut > 0 && !text.is_char_boundary(cut) { + cut -= 1; + } + &text[..cut] } else { text }; @@ -512,4 +518,16 @@ mod tests { ); assert!(r.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk")); } + + #[test] + fn scan_truncation_at_non_ascii_boundary_does_not_panic() { + // A bundle just over the scan budget, padded with a multibyte char + // ('é' is 2 bytes) so the cut lands mid-codepoint. The old + // `&text[..budget]` slice panicked here; the boundary snap must not. + let pad = "é".repeat(MAX_SCAN_BYTES); // ~2× budget in bytes + let bundle = format!("{pad} fetch(\"/api/x\")"); + let bundles = vec![("big.js".to_string(), bundle)]; + let r = extract_endpoints("", "https://example.com/", &bundles); + assert!(r.truncated, "oversized bundle should mark truncated"); + } } diff --git a/crates/webclaw-core/src/structured_data.rs b/crates/webclaw-core/src/structured_data.rs index cd1ae1b..d40465f 100644 --- a/crates/webclaw-core/src/structured_data.rs +++ b/crates/webclaw-core/src/structured_data.rs @@ -178,7 +178,12 @@ pub fn extract_sveltekit(html: &str) -> Vec { /// Preserves already-quoted keys and string values. fn js_literal_to_json(input: &str) -> String { let bytes = input.as_bytes(); - let mut out = String::with_capacity(input.len() + input.len() / 10); + // Accumulate raw bytes, not `byte as char`. The input is valid UTF-8 and we + // only ever copy its bytes verbatim or insert ASCII quotes, so the result is + // guaranteed valid UTF-8 — copying byte-by-byte preserves multibyte + // codepoints (e.g. accented/CJK string values) instead of mangling them + // into Latin-1 mojibake. 
+ let mut out: Vec<u8> = Vec::with_capacity(input.len() + input.len() / 10); let mut i = 0; let len = bytes.len(); @@ -187,14 +192,14 @@ fn js_literal_to_json(input: &str) -> String { // Skip through strings if b == b'"' { - out.push('"'); + out.push(b'"'); i += 1; while i < len { let c = bytes[i]; - out.push(c as char); + out.push(c); i += 1; if c == b'\\' && i < len { - out.push(bytes[i] as char); + out.push(bytes[i]); i += 1; } else if c == b'"' { break; @@ -205,11 +210,11 @@ fn js_literal_to_json(input: &str) -> String { // After { or , — look for unquoted key followed by : if (b == b'{' || b == b',' || b == b'[') && i + 1 < len { - out.push(b as char); + out.push(b); i += 1; // Skip whitespace while i < len && bytes[i].is_ascii_whitespace() { - out.push(bytes[i] as char); + out.push(bytes[i]); i += 1; } // Check if next is an unquoted identifier (key) @@ -218,29 +223,30 @@ fn js_literal_to_json(input: &str) -> String { while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') { i += 1; } - let key = &input[key_start..i]; + let key = &bytes[key_start..i]; // Skip whitespace after key while i < len && bytes[i].is_ascii_whitespace() { i += 1; } // If followed by :, it's an unquoted key — quote it if i < len && bytes[i] == b':' { - out.push('"'); - out.push_str(key); - out.push('"'); + out.push(b'"'); + out.extend_from_slice(key); + out.push(b'"'); } else { // Not a key — might be a bare value like true/false/null - out.push_str(key); + out.extend_from_slice(key); } } continue; } - out.push(b as char); + out.push(b); i += 1; } - out + // Safe: we only copied bytes from valid-UTF-8 `input` plus ASCII quotes. + String::from_utf8(out).unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned()) } /// Replace raw newlines/tabs inside JSON string values with escape sequences. 
@@ -440,4 +446,17 @@ newline"}"#; assert_eq!(parsed["text"], "line1\nline2"); assert_eq!(parsed["raw"], "has\nnewline"); } + + #[test] + fn js_literal_to_json_preserves_multibyte_utf8() { + // Unquoted ASCII keys with accented and CJK string values (the shape + // SvelteKit emits). The old `byte as char` path turned the multibyte + // values into Latin-1 mojibake; they must now survive intact. + let input = r#"{name:"déjà vu", city:"東京", emoji:"🌱"}"#; + let json = js_literal_to_json(input); + let parsed: Value = serde_json::from_str(&json).unwrap(); + assert_eq!(parsed["name"], "déjà vu"); + assert_eq!(parsed["city"], "東京"); + assert_eq!(parsed["emoji"], "🌱"); + } } diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index 0724cec..7553bb5 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -801,11 +801,17 @@ fn is_challenge_html(html: &str) -> bool { false } -/// Extract the homepage URL (scheme + host) from a full URL. +/// Extract the homepage URL (scheme + host[:port]) from a full URL. fn extract_homepage(url: &str) -> Option<String> { - url::Url::parse(url) - .ok() - .map(|u| format!("{}://{}/", u.scheme(), u.host_str().unwrap_or(""))) + url::Url::parse(url).ok().map(|u| { + let host = u.host_str().unwrap_or(""); + // `port()` is `Some` only for a non-default port; include it so a + // host like example.com:8443 is warmed on the right port. + match u.port() { + Some(port) => format!("{}://{}:{}/", u.scheme(), host, port), + None => format!("{}://{}/", u.scheme(), host), + } + }) } /// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult. diff --git a/crates/webclaw-llm/src/providers/anthropic.rs b/crates/webclaw-llm/src/providers/anthropic.rs index e6e43c8..eb15973 100644 --- a/crates/webclaw-llm/src/providers/anthropic.rs +++ b/crates/webclaw-llm/src/providers/anthropic.rs @@ -1,6 +1,8 @@ /// Anthropic provider — Claude models via api.anthropic.com. 
/// Anthropic's API differs from OpenAI: system message is a top-level param, /// not part of the messages array. +use std::time::Duration; + use async_trait::async_trait; use serde_json::json; @@ -35,7 +37,11 @@ impl AnthropicProvider { let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?; Some(Self { - client: reqwest::Client::new(), + client: reqwest::Client::builder() + .timeout(Duration::from_secs(120)) + .connect_timeout(Duration::from_secs(10)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()), key, base_url: base_url .or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok()) @@ -108,11 +114,7 @@ impl LlmProvider for AnthropicProvider { if !resp.status().is_success() { let status = resp.status(); let text = resp.text().await.unwrap_or_default(); - let safe_text = if text.len() > 500 { - &text[..500] - } else { - &text - }; + let safe_text = text.chars().take(500).collect::<String>(); return Err(LlmError::ProviderError(format!( "anthropic returned {status}: {safe_text}" ))); diff --git a/crates/webclaw-llm/src/providers/ollama.rs b/crates/webclaw-llm/src/providers/ollama.rs index 9ee66c9..e7e3891 100644 --- a/crates/webclaw-llm/src/providers/ollama.rs +++ b/crates/webclaw-llm/src/providers/ollama.rs @@ -1,5 +1,7 @@ /// Ollama provider — talks to a local Ollama instance (default localhost:11434). /// First choice in the provider chain: free, private, fast on Apple Silicon. 
+use std::time::Duration; + use async_trait::async_trait; use serde_json::json; @@ -24,7 +26,11 @@ impl OllamaProvider { .unwrap_or_else(|| "qwen3:8b".into()); Self { - client: reqwest::Client::new(), + client: reqwest::Client::builder() + .timeout(Duration::from_secs(120)) + .connect_timeout(Duration::from_secs(10)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()), base_url, default_model, } @@ -70,11 +76,7 @@ impl LlmProvider for OllamaProvider { if !resp.status().is_success() { let status = resp.status(); let text = resp.text().await.unwrap_or_default(); - let safe_text = if text.len() > 500 { - &text[..500] - } else { - &text - }; + let safe_text = text.chars().take(500).collect::<String>(); return Err(LlmError::ProviderError(format!( "ollama returned {status}: {safe_text}" ))); @@ -98,7 +100,8 @@ impl LlmProvider for OllamaProvider { async fn is_available(&self) -> bool { let url = format!("{}/api/tags", self.base_url); - matches!(self.client.get(&url).send().await, Ok(r) if r.status().is_success()) + let req = self.client.get(&url).timeout(Duration::from_secs(10)); + matches!(req.send().await, Ok(r) if r.status().is_success()) } fn name(&self) -> &str { diff --git a/crates/webclaw-llm/src/providers/openai.rs b/crates/webclaw-llm/src/providers/openai.rs index 3780d8f..af15fcd 100644 --- a/crates/webclaw-llm/src/providers/openai.rs +++ b/crates/webclaw-llm/src/providers/openai.rs @@ -1,4 +1,6 @@ /// OpenAI provider — works with api.openai.com and any OpenAI-compatible endpoint. 
+use std::time::Duration; + use async_trait::async_trait; use serde_json::json; @@ -69,7 +71,11 @@ impl OpenAiProvider { let key = load_api_key(key_override, "OPENAI_API_KEY")?; Some(Self { - client: reqwest::Client::new(), + client: reqwest::Client::builder() + .timeout(Duration::from_secs(120)) + .connect_timeout(Duration::from_secs(10)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()), key, base_url: base_url .or_else(|| std::env::var("OPENAI_BASE_URL").ok()) @@ -132,11 +138,7 @@ impl LlmProvider for OpenAiProvider { if !resp.status().is_success() { let status = resp.status(); let text = resp.text().await.unwrap_or_default(); - let safe_text = if text.len() > 500 { - &text[..500] - } else { - &text - }; + let safe_text = text.chars().take(500).collect::<String>(); return Err(LlmError::ProviderError(format!( "openai returned {status}: {safe_text}" ))); diff --git a/crates/webclaw-mcp/src/server.rs b/crates/webclaw-mcp/src/server.rs index 497315f..67cf06a 100644 --- a/crates/webclaw-mcp/src/server.rs +++ b/crates/webclaw-mcp/src/server.rs @@ -323,9 +323,10 @@ impl WebclawMcp { if params.urls.len() > 100 { return Err("batch is limited to 100 URLs per request".into()); } - for u in &params.urls { - validate_url(u).await?; - } + // No up-front DNS pre-validation: it aborted the whole batch on a + // single unresolvable URL. The fetch layer applies the same SSRF + // guard (validate_public_http_url) per URL, so bad entries surface + // as individual per-URL errors below instead of failing the batch. let format = params.format.as_deref().unwrap_or("markdown"); let concurrency = params.concurrency.unwrap_or(5);