mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-07-02 04:08:08 +02:00
Merge 499345046c into d0d7b835f2
This commit is contained in:
commit
11a78c71fe
9 changed files with 117 additions and 51 deletions
12
CHANGELOG.md
12
CHANGELOG.md
|
|
@ -3,6 +3,18 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Pages with multibyte text (accented or CJK characters) no longer panic or get mangled during extraction. API-endpoint discovery now cuts oversized scripts on a character boundary instead of crashing mid-character, and structured-data parsing preserves non-ASCII string values instead of turning them into mojibake.
|
||||||
|
- LLM error messages from a provider no longer panic when the error body contains multibyte characters near the truncation point.
|
||||||
|
- LLM provider requests now have explicit connect and overall timeouts, so a stalled or unreachable provider fails fast instead of hanging.
|
||||||
|
- Batch extraction in the MCP server no longer aborts the whole batch when a single URL fails to resolve; bad URLs are reported as individual per-URL errors and the rest still run.
|
||||||
|
- CLI crawl and batch runs now wait for the completion webhook to actually send before exiting, replacing a fixed delay that could cut the request off or waste time.
|
||||||
|
- Homepage warm-up requests now include the port for hosts on a non-default port, so those sites are warmed correctly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## [0.6.7] — 2026-06-09
|
## [0.6.7] — 2026-06-09
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
|
|
||||||
|
|
@ -1548,7 +1548,7 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
||||||
// Fire webhook on crawl complete
|
// Fire webhook on crawl complete
|
||||||
if let Some(ref webhook_url) = cli.webhook {
|
if let Some(ref webhook_url) = cli.webhook {
|
||||||
let urls: Vec<&str> = result.pages.iter().map(|p| p.url.as_str()).collect();
|
let urls: Vec<&str> = result.pages.iter().map(|p| p.url.as_str()).collect();
|
||||||
fire_webhook(
|
let handle = fire_webhook(
|
||||||
webhook_url,
|
webhook_url,
|
||||||
&serde_json::json!({
|
&serde_json::json!({
|
||||||
"event": "crawl_complete",
|
"event": "crawl_complete",
|
||||||
|
|
@ -1559,8 +1559,8 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
|
||||||
"urls": urls,
|
"urls": urls,
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
// Brief pause so the async webhook has time to fire
|
// Wait for the webhook to finish so the process doesn't exit mid-send.
|
||||||
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
|
let _ = handle.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
if result.errors > 0 {
|
if result.errors > 0 {
|
||||||
|
|
@ -1658,7 +1658,7 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
|
||||||
// Fire webhook on batch complete
|
// Fire webhook on batch complete
|
||||||
if let Some(ref webhook_url) = cli.webhook {
|
if let Some(ref webhook_url) = cli.webhook {
|
||||||
let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect();
|
let urls: Vec<&str> = results.iter().map(|r| r.url.as_str()).collect();
|
||||||
fire_webhook(
|
let handle = fire_webhook(
|
||||||
webhook_url,
|
webhook_url,
|
||||||
&serde_json::json!({
|
&serde_json::json!({
|
||||||
"event": "batch_complete",
|
"event": "batch_complete",
|
||||||
|
|
@ -1668,7 +1668,7 @@ async fn run_batch(cli: &Cli, entries: &[(String, Option<String>)]) -> Result<()
|
||||||
"urls": urls,
|
"urls": urls,
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
|
let _ = handle.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
if errors > 0 {
|
if errors > 0 {
|
||||||
|
|
@ -1742,9 +1742,12 @@ async fn spawn_on_change(cmd: &str, stdin_payload: &[u8]) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Fire a webhook POST with a JSON payload. Non-blocking — errors logged to stderr.
|
/// Fire a webhook POST with a JSON payload. Spawns the send on a background task
|
||||||
/// Auto-detects Discord and Slack webhook URLs and wraps the payload accordingly.
|
/// and returns its `JoinHandle` so callers that need delivery (e.g. one-shot
|
||||||
fn fire_webhook(url: &str, payload: &serde_json::Value) {
|
/// crawl/batch runs that exit immediately after) can `.await` it; long-running
|
||||||
|
/// loops can drop the handle and let it run fire-and-forget. Errors are logged
|
||||||
|
/// to stderr. Auto-detects Discord and Slack webhook URLs and wraps the payload.
|
||||||
|
fn fire_webhook(url: &str, payload: &serde_json::Value) -> tokio::task::JoinHandle<()> {
|
||||||
let url = url.to_string();
|
let url = url.to_string();
|
||||||
let is_discord = url.contains("discord.com/api/webhooks");
|
let is_discord = url.contains("discord.com/api/webhooks");
|
||||||
let is_slack = url.contains("hooks.slack.com");
|
let is_slack = url.contains("hooks.slack.com");
|
||||||
|
|
@ -1806,7 +1809,7 @@ fn fire_webhook(url: &str, payload: &serde_json::Value) {
|
||||||
},
|
},
|
||||||
Err(e) => eprintln!("[webhook] client error: {e}"),
|
Err(e) => eprintln!("[webhook] client error: {e}"),
|
||||||
}
|
}
|
||||||
});
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
|
async fn run_watch(cli: &Cli, urls: &[String]) -> Result<(), String> {
|
||||||
|
|
@ -2318,7 +2321,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
|
||||||
eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");
|
eprintln!("Processed {total} URLs ({ok} ok, {errors} errors)");
|
||||||
|
|
||||||
if let Some(ref webhook_url) = cli.webhook {
|
if let Some(ref webhook_url) = cli.webhook {
|
||||||
fire_webhook(
|
let handle = fire_webhook(
|
||||||
webhook_url,
|
webhook_url,
|
||||||
&serde_json::json!({
|
&serde_json::json!({
|
||||||
"event": "batch_llm_complete",
|
"event": "batch_llm_complete",
|
||||||
|
|
@ -2327,7 +2330,7 @@ async fn run_batch_llm(cli: &Cli, entries: &[(String, Option<String>)]) -> Resul
|
||||||
"errors": errors,
|
"errors": errors,
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
|
let _ = handle.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
if errors > 0 {
|
if errors > 0 {
|
||||||
|
|
|
||||||
|
|
@ -233,7 +233,13 @@ pub fn extract_endpoints(
|
||||||
}
|
}
|
||||||
let slice = if text.len() > *budget {
|
let slice = if text.len() > *budget {
|
||||||
*truncated = true;
|
*truncated = true;
|
||||||
&text[..*budget]
|
// Snap the cut to a UTF-8 char boundary so non-ASCII content
|
||||||
|
// (multibyte codepoints straddling the budget) can't panic.
|
||||||
|
let mut cut = (*budget).min(text.len());
|
||||||
|
while cut > 0 && !text.is_char_boundary(cut) {
|
||||||
|
cut -= 1;
|
||||||
|
}
|
||||||
|
&text[..cut]
|
||||||
} else {
|
} else {
|
||||||
text
|
text
|
||||||
};
|
};
|
||||||
|
|
@ -512,4 +518,16 @@ mod tests {
|
||||||
);
|
);
|
||||||
assert!(r.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk"));
|
assert!(r.hosts.iter().any(|h| h == "pubapi.ticketmaster.co.uk"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn scan_truncation_at_non_ascii_boundary_does_not_panic() {
|
||||||
|
// A bundle just over the scan budget, padded with a multibyte char
|
||||||
|
// ('é' is 2 bytes) so the cut lands mid-codepoint. The old
|
||||||
|
// `&text[..budget]` slice panicked here; the boundary snap must not.
|
||||||
|
let pad = "é".repeat(MAX_SCAN_BYTES); // ~2× budget in bytes
|
||||||
|
let bundle = format!("{pad} fetch(\"/api/x\")");
|
||||||
|
let bundles = vec![("big.js".to_string(), bundle)];
|
||||||
|
let r = extract_endpoints("<html></html>", "https://example.com/", &bundles);
|
||||||
|
assert!(r.truncated, "oversized bundle should mark truncated");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -178,7 +178,12 @@ pub fn extract_sveltekit(html: &str) -> Vec<Value> {
|
||||||
/// Preserves already-quoted keys and string values.
|
/// Preserves already-quoted keys and string values.
|
||||||
fn js_literal_to_json(input: &str) -> String {
|
fn js_literal_to_json(input: &str) -> String {
|
||||||
let bytes = input.as_bytes();
|
let bytes = input.as_bytes();
|
||||||
let mut out = String::with_capacity(input.len() + input.len() / 10);
|
// Accumulate raw bytes, not `byte as char`. The input is valid UTF-8 and we
|
||||||
|
// only ever copy its bytes verbatim or insert ASCII quotes, so the result is
|
||||||
|
// guaranteed valid UTF-8 — copying byte-by-byte preserves multibyte
|
||||||
|
// codepoints (e.g. accented/CJK string values) instead of mangling them
|
||||||
|
// into Latin-1 mojibake.
|
||||||
|
let mut out: Vec<u8> = Vec::with_capacity(input.len() + input.len() / 10);
|
||||||
let mut i = 0;
|
let mut i = 0;
|
||||||
let len = bytes.len();
|
let len = bytes.len();
|
||||||
|
|
||||||
|
|
@ -187,14 +192,14 @@ fn js_literal_to_json(input: &str) -> String {
|
||||||
|
|
||||||
// Skip through strings
|
// Skip through strings
|
||||||
if b == b'"' {
|
if b == b'"' {
|
||||||
out.push('"');
|
out.push(b'"');
|
||||||
i += 1;
|
i += 1;
|
||||||
while i < len {
|
while i < len {
|
||||||
let c = bytes[i];
|
let c = bytes[i];
|
||||||
out.push(c as char);
|
out.push(c);
|
||||||
i += 1;
|
i += 1;
|
||||||
if c == b'\\' && i < len {
|
if c == b'\\' && i < len {
|
||||||
out.push(bytes[i] as char);
|
out.push(bytes[i]);
|
||||||
i += 1;
|
i += 1;
|
||||||
} else if c == b'"' {
|
} else if c == b'"' {
|
||||||
break;
|
break;
|
||||||
|
|
@ -205,11 +210,11 @@ fn js_literal_to_json(input: &str) -> String {
|
||||||
|
|
||||||
// After { or , — look for unquoted key followed by :
|
// After { or , — look for unquoted key followed by :
|
||||||
if (b == b'{' || b == b',' || b == b'[') && i + 1 < len {
|
if (b == b'{' || b == b',' || b == b'[') && i + 1 < len {
|
||||||
out.push(b as char);
|
out.push(b);
|
||||||
i += 1;
|
i += 1;
|
||||||
// Skip whitespace
|
// Skip whitespace
|
||||||
while i < len && bytes[i].is_ascii_whitespace() {
|
while i < len && bytes[i].is_ascii_whitespace() {
|
||||||
out.push(bytes[i] as char);
|
out.push(bytes[i]);
|
||||||
i += 1;
|
i += 1;
|
||||||
}
|
}
|
||||||
// Check if next is an unquoted identifier (key)
|
// Check if next is an unquoted identifier (key)
|
||||||
|
|
@ -218,29 +223,30 @@ fn js_literal_to_json(input: &str) -> String {
|
||||||
while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
|
while i < len && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
|
||||||
i += 1;
|
i += 1;
|
||||||
}
|
}
|
||||||
let key = &input[key_start..i];
|
let key = &bytes[key_start..i];
|
||||||
// Skip whitespace after key
|
// Skip whitespace after key
|
||||||
while i < len && bytes[i].is_ascii_whitespace() {
|
while i < len && bytes[i].is_ascii_whitespace() {
|
||||||
i += 1;
|
i += 1;
|
||||||
}
|
}
|
||||||
// If followed by :, it's an unquoted key — quote it
|
// If followed by :, it's an unquoted key — quote it
|
||||||
if i < len && bytes[i] == b':' {
|
if i < len && bytes[i] == b':' {
|
||||||
out.push('"');
|
out.push(b'"');
|
||||||
out.push_str(key);
|
out.extend_from_slice(key);
|
||||||
out.push('"');
|
out.push(b'"');
|
||||||
} else {
|
} else {
|
||||||
// Not a key — might be a bare value like true/false/null
|
// Not a key — might be a bare value like true/false/null
|
||||||
out.push_str(key);
|
out.extend_from_slice(key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
out.push(b as char);
|
out.push(b);
|
||||||
i += 1;
|
i += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
out
|
// Safe: we only copied bytes from valid-UTF-8 `input` plus ASCII quotes.
|
||||||
|
String::from_utf8(out).unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Replace raw newlines/tabs inside JSON string values with escape sequences.
|
/// Replace raw newlines/tabs inside JSON string values with escape sequences.
|
||||||
|
|
@ -440,4 +446,17 @@ newline"}"#;
|
||||||
assert_eq!(parsed["text"], "line1\nline2");
|
assert_eq!(parsed["text"], "line1\nline2");
|
||||||
assert_eq!(parsed["raw"], "has\nnewline");
|
assert_eq!(parsed["raw"], "has\nnewline");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn js_literal_to_json_preserves_multibyte_utf8() {
|
||||||
|
// Unquoted ASCII keys with accented and CJK string values (the shape
|
||||||
|
// SvelteKit emits). The old `byte as char` path turned the multibyte
|
||||||
|
// values into Latin-1 mojibake; they must now survive intact.
|
||||||
|
let input = r#"{name:"déjà vu", city:"東京", emoji:"🌱"}"#;
|
||||||
|
let json = js_literal_to_json(input);
|
||||||
|
let parsed: Value = serde_json::from_str(&json).unwrap();
|
||||||
|
assert_eq!(parsed["name"], "déjà vu");
|
||||||
|
assert_eq!(parsed["city"], "東京");
|
||||||
|
assert_eq!(parsed["emoji"], "🌱");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -801,11 +801,17 @@ fn is_challenge_html(html: &str) -> bool {
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract the homepage URL (scheme + host) from a full URL.
|
/// Extract the homepage URL (scheme + host[:port]) from a full URL.
|
||||||
fn extract_homepage(url: &str) -> Option<String> {
|
fn extract_homepage(url: &str) -> Option<String> {
|
||||||
url::Url::parse(url)
|
url::Url::parse(url).ok().map(|u| {
|
||||||
.ok()
|
let host = u.host_str().unwrap_or("");
|
||||||
.map(|u| format!("{}://{}/", u.scheme(), u.host_str().unwrap_or("")))
|
// `port()` is `Some` only for a non-default port; include it so a
|
||||||
|
// host like example.com:8443 is warmed on the right port.
|
||||||
|
match u.port() {
|
||||||
|
Some(port) => format!("{}://{}:{}/", u.scheme(), host, port),
|
||||||
|
None => format!("{}://{}/", u.scheme(), host),
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.
|
/// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult.
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,8 @@
|
||||||
/// Anthropic provider — Claude models via api.anthropic.com.
|
/// Anthropic provider — Claude models via api.anthropic.com.
|
||||||
/// Anthropic's API differs from OpenAI: system message is a top-level param,
|
/// Anthropic's API differs from OpenAI: system message is a top-level param,
|
||||||
/// not part of the messages array.
|
/// not part of the messages array.
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
|
||||||
|
|
@ -35,7 +37,11 @@ impl AnthropicProvider {
|
||||||
let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?;
|
let key = load_api_key(key_override, "ANTHROPIC_API_KEY")?;
|
||||||
|
|
||||||
Some(Self {
|
Some(Self {
|
||||||
client: reqwest::Client::new(),
|
client: reqwest::Client::builder()
|
||||||
|
.timeout(Duration::from_secs(120))
|
||||||
|
.connect_timeout(Duration::from_secs(10))
|
||||||
|
.build()
|
||||||
|
.unwrap_or_else(|_| reqwest::Client::new()),
|
||||||
key,
|
key,
|
||||||
base_url: base_url
|
base_url: base_url
|
||||||
.or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok())
|
.or_else(|| std::env::var("ANTHROPIC_BASE_URL").ok())
|
||||||
|
|
@ -108,11 +114,7 @@ impl LlmProvider for AnthropicProvider {
|
||||||
if !resp.status().is_success() {
|
if !resp.status().is_success() {
|
||||||
let status = resp.status();
|
let status = resp.status();
|
||||||
let text = resp.text().await.unwrap_or_default();
|
let text = resp.text().await.unwrap_or_default();
|
||||||
let safe_text = if text.len() > 500 {
|
let safe_text = text.chars().take(500).collect::<String>();
|
||||||
&text[..500]
|
|
||||||
} else {
|
|
||||||
&text
|
|
||||||
};
|
|
||||||
return Err(LlmError::ProviderError(format!(
|
return Err(LlmError::ProviderError(format!(
|
||||||
"anthropic returned {status}: {safe_text}"
|
"anthropic returned {status}: {safe_text}"
|
||||||
)));
|
)));
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
/// Ollama provider — talks to a local Ollama instance (default localhost:11434).
|
/// Ollama provider — talks to a local Ollama instance (default localhost:11434).
|
||||||
/// First choice in the provider chain: free, private, fast on Apple Silicon.
|
/// First choice in the provider chain: free, private, fast on Apple Silicon.
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
|
||||||
|
|
@ -24,7 +26,11 @@ impl OllamaProvider {
|
||||||
.unwrap_or_else(|| "qwen3:8b".into());
|
.unwrap_or_else(|| "qwen3:8b".into());
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
client: reqwest::Client::new(),
|
client: reqwest::Client::builder()
|
||||||
|
.timeout(Duration::from_secs(120))
|
||||||
|
.connect_timeout(Duration::from_secs(10))
|
||||||
|
.build()
|
||||||
|
.unwrap_or_else(|_| reqwest::Client::new()),
|
||||||
base_url,
|
base_url,
|
||||||
default_model,
|
default_model,
|
||||||
}
|
}
|
||||||
|
|
@ -70,11 +76,7 @@ impl LlmProvider for OllamaProvider {
|
||||||
if !resp.status().is_success() {
|
if !resp.status().is_success() {
|
||||||
let status = resp.status();
|
let status = resp.status();
|
||||||
let text = resp.text().await.unwrap_or_default();
|
let text = resp.text().await.unwrap_or_default();
|
||||||
let safe_text = if text.len() > 500 {
|
let safe_text = text.chars().take(500).collect::<String>();
|
||||||
&text[..500]
|
|
||||||
} else {
|
|
||||||
&text
|
|
||||||
};
|
|
||||||
return Err(LlmError::ProviderError(format!(
|
return Err(LlmError::ProviderError(format!(
|
||||||
"ollama returned {status}: {safe_text}"
|
"ollama returned {status}: {safe_text}"
|
||||||
)));
|
)));
|
||||||
|
|
@ -98,7 +100,8 @@ impl LlmProvider for OllamaProvider {
|
||||||
|
|
||||||
async fn is_available(&self) -> bool {
|
async fn is_available(&self) -> bool {
|
||||||
let url = format!("{}/api/tags", self.base_url);
|
let url = format!("{}/api/tags", self.base_url);
|
||||||
matches!(self.client.get(&url).send().await, Ok(r) if r.status().is_success())
|
let req = self.client.get(&url).timeout(Duration::from_secs(10));
|
||||||
|
matches!(req.send().await, Ok(r) if r.status().is_success())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn name(&self) -> &str {
|
fn name(&self) -> &str {
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,6 @@
|
||||||
/// OpenAI provider — works with api.openai.com and any OpenAI-compatible endpoint.
|
/// OpenAI provider — works with api.openai.com and any OpenAI-compatible endpoint.
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
|
||||||
|
|
@ -69,7 +71,11 @@ impl OpenAiProvider {
|
||||||
let key = load_api_key(key_override, "OPENAI_API_KEY")?;
|
let key = load_api_key(key_override, "OPENAI_API_KEY")?;
|
||||||
|
|
||||||
Some(Self {
|
Some(Self {
|
||||||
client: reqwest::Client::new(),
|
client: reqwest::Client::builder()
|
||||||
|
.timeout(Duration::from_secs(120))
|
||||||
|
.connect_timeout(Duration::from_secs(10))
|
||||||
|
.build()
|
||||||
|
.unwrap_or_else(|_| reqwest::Client::new()),
|
||||||
key,
|
key,
|
||||||
base_url: base_url
|
base_url: base_url
|
||||||
.or_else(|| std::env::var("OPENAI_BASE_URL").ok())
|
.or_else(|| std::env::var("OPENAI_BASE_URL").ok())
|
||||||
|
|
@ -132,11 +138,7 @@ impl LlmProvider for OpenAiProvider {
|
||||||
if !resp.status().is_success() {
|
if !resp.status().is_success() {
|
||||||
let status = resp.status();
|
let status = resp.status();
|
||||||
let text = resp.text().await.unwrap_or_default();
|
let text = resp.text().await.unwrap_or_default();
|
||||||
let safe_text = if text.len() > 500 {
|
let safe_text = text.chars().take(500).collect::<String>();
|
||||||
&text[..500]
|
|
||||||
} else {
|
|
||||||
&text
|
|
||||||
};
|
|
||||||
return Err(LlmError::ProviderError(format!(
|
return Err(LlmError::ProviderError(format!(
|
||||||
"openai returned {status}: {safe_text}"
|
"openai returned {status}: {safe_text}"
|
||||||
)));
|
)));
|
||||||
|
|
|
||||||
|
|
@ -323,9 +323,10 @@ impl WebclawMcp {
|
||||||
if params.urls.len() > 100 {
|
if params.urls.len() > 100 {
|
||||||
return Err("batch is limited to 100 URLs per request".into());
|
return Err("batch is limited to 100 URLs per request".into());
|
||||||
}
|
}
|
||||||
for u in &params.urls {
|
// No up-front DNS pre-validation: it aborted the whole batch on a
|
||||||
validate_url(u).await?;
|
// single unresolvable URL. The fetch layer applies the same SSRF
|
||||||
}
|
// guard (validate_public_http_url) per URL, so bad entries surface
|
||||||
|
// as individual per-URL errors below instead of failing the batch.
|
||||||
|
|
||||||
let format = params.format.as_deref().unwrap_or("markdown");
|
let format = params.format.as_deref().unwrap_or("markdown");
|
||||||
let concurrency = params.concurrency.unwrap_or(5);
|
let concurrency = params.concurrency.unwrap_or(5);
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue