From ea9c783bc515be185ced912ee3913dd27dbf5a06 Mon Sep 17 00:00:00 2001 From: Valerio Date: Tue, 24 Mar 2026 17:25:05 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20v0.1.1=20=E2=80=94=20MCP=20identity,=20t?= =?UTF-8?q?imeouts,=20exit=20codes,=20URL=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical: - MCP server identifies as "webclaw-mcp" instead of "rmcp" - Research tool poll loop capped at 200 iterations (~10 min) CLI: - Non-zero exit codes on errors - Text format strips markdown table syntax MCP server: - URL validation on all tools - 60s cloud API timeout, 30s local fetch timeout - Diff cloud fallback computes actual diff - Batch capped at 100 URLs, crawl at 500 pages - Graceful startup failure instead of panic Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 18 ++++ Cargo.lock | 13 +-- Cargo.toml | 2 +- crates/webclaw-cli/src/main.rs | 12 ++- crates/webclaw-core/src/markdown.rs | 42 +++++++++- crates/webclaw-mcp/Cargo.toml | 1 + crates/webclaw-mcp/src/cloud.rs | 16 ++-- crates/webclaw-mcp/src/server.rs | 124 ++++++++++++++++++++++++---- 8 files changed, 194 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f4da1d8..baba179 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,24 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). 
+## [0.1.1] — 2026-03-24 + +### Fixed +- MCP server now identifies as `webclaw-mcp` instead of `rmcp` in the MCP handshake +- Research tool polling caps at 200 iterations (~10 min) instead of looping forever +- CLI returns non-zero exit codes on errors (invalid format, fetch failures, missing LLM) +- Text format output strips markdown table syntax (`| --- |` pipes) +- All MCP tools validate URLs before network calls with clear error messages +- Cloud API HTTP client has 60s timeout instead of no timeout +- Local fetch calls timeout after 30s to prevent hanging on slow servers +- Diff cloud fallback computes actual diff instead of returning raw scrape JSON +- FetchClient startup failure logs and exits gracefully instead of panicking + +### Added +- Upper bounds: batch capped at 100 URLs, crawl capped at 500 pages + +--- + ## [0.1.0] — 2026-03-18 First public release. Full-featured web content extraction toolkit for LLMs. diff --git a/Cargo.lock b/Cargo.lock index 19478f8..b4b4ed8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2854,7 +2854,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.1.0" +version = "0.1.1" dependencies = [ "clap", "dotenvy", @@ -2874,7 +2874,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.1.0" +version = "0.1.1" dependencies = [ "ego-tree", "once_cell", @@ -2891,7 +2891,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.1.0" +version = "0.1.1" dependencies = [ "primp", "quick-xml", @@ -2909,7 +2909,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.1.0" +version = "0.1.1" dependencies = [ "async-trait", "reqwest", @@ -2922,7 +2922,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.1.0" +version = "0.1.1" dependencies = [ "dotenvy", "reqwest", @@ -2933,6 +2933,7 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", + "url", "webclaw-core", "webclaw-fetch", "webclaw-llm", @@ -2941,7 +2942,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" 
-version = "0.1.0" +version = "0.1.1" dependencies = [ "pdf-extract", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index c87a9d7..76ab6a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.1.0" +version = "0.1.1" edition = "2024" license = "MIT" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 8d9fb32..af5b0af 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -961,7 +961,11 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> { result.total, result.ok, result.errors, result.elapsed_secs, ); - Ok(()) + if result.errors > 0 { + Err(format!("{} of {} pages failed", result.errors, result.total)) + } else { + Ok(()) + } } async fn run_map(cli: &Cli) -> Result<(), String> { @@ -1023,7 +1027,11 @@ async fn run_batch(cli: &Cli, urls: &[String]) -> Result<(), String> { errors ); - Ok(()) + if errors > 0 { + Err(format!("{errors} of {} URLs failed", results.len())) + } else { + Ok(()) + } } async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> { diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs index fcfda0e..91f2c6e 100644 --- a/crates/webclaw-core/src/markdown.rs +++ b/crates/webclaw-core/src/markdown.rs @@ -786,6 +786,9 @@ fn strip_markdown(md: &str) -> String { static ITALIC_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\*([^*]+)\*").unwrap()); static CODE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"`([^`]+)`").unwrap()); static HEADING_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?m)^#{1,6}\s+").unwrap()); + // Table separator rows: | --- | --- | (with optional colons for alignment) + static TABLE_SEP_RE: Lazy<Regex> = + Lazy::new(|| Regex::new(r"^\|\s*:?-{2,}:?\s*(\|\s*:?-{2,}:?\s*)*\|$").unwrap()); let s = IMG_RE.replace_all(md, "$1"); let s = LINK_RE.replace_all(&s, "$1"); @@ -794,15 +797,31 @@ fn strip_markdown(md: &str) -> String { let s =
CODE_RE.replace_all(&s, "$1"); let s = HEADING_RE.replace_all(&s, ""); - // Remove fenced code block markers - let mut lines: Vec<&str> = Vec::new(); + // Remove fenced code block markers + strip table syntax + let mut lines: Vec<String> = Vec::new(); let mut in_fence = false; for line in s.lines() { if line.trim_start().starts_with("```") { in_fence = !in_fence; continue; } - lines.push(line); + + let trimmed = line.trim(); + + // Skip table separator rows (| --- | --- |) + if TABLE_SEP_RE.is_match(trimmed) { + continue; + } + + // Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs + if trimmed.starts_with('|') && trimmed.ends_with('|') { + let inner = &trimmed[1..trimmed.len() - 1]; + let cells: Vec<&str> = inner.split('|').map(|c| c.trim()).collect(); + lines.push(cells.join("\t")); + continue; + } + + lines.push(line.to_string()); } lines.join("\n") @@ -993,6 +1012,23 @@ mod tests { assert!(!plain.contains("[")); } + #[test] + fn strips_table_syntax_from_plain_text() { + let html = r##" + <table> + <tr><th>Name</th><th>Age</th></tr> + <tr><td>Alice</td><td>30</td></tr> + </table> + "##; + let (md, plain, _) = convert_html(html, None); + // Markdown should have table syntax + assert!(md.contains("| --- |")); + // Plain text should NOT have any pipe or separator syntax + assert!(!plain.contains("| --- |"), "separator row leaked: {plain}"); + assert!(!plain.contains("| Name"), "pipe syntax leaked: {plain}"); + assert!(plain.contains("Name"), "table content missing: {plain}"); + assert!(plain.contains("Alice"), "table content missing: {plain}"); + } + #[test] fn nested_list() { let html = r##" diff --git a/crates/webclaw-mcp/Cargo.toml b/crates/webclaw-mcp/Cargo.toml index 96e4776..acebfc7 100644 --- a/crates/webclaw-mcp/Cargo.toml +++ b/crates/webclaw-mcp/Cargo.toml @@ -23,3 +23,4 @@ tokio = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } +url = "2" diff --git a/crates/webclaw-mcp/src/cloud.rs b/crates/webclaw-mcp/src/cloud.rs index 565a172..bf05ffe 100644 --- a/crates/webclaw-mcp/src/cloud.rs +++ b/crates/webclaw-mcp/src/cloud.rs @@ -3,6 +3,7 @@ /// When local fetch returns a challenge page, this module retries /// via api.webclaw.io. Requires WEBCLAW_API_KEY to be set. use std::collections::HashMap; +use std::time::Duration; use serde_json::{Value, json}; use tracing::info; @@ -23,10 +24,11 @@ impl CloudClient { if key.is_empty() { return None; } - Some(Self { - api_key: key, - http: reqwest::Client::new(), - }) + let http = reqwest::Client::builder() + .timeout(Duration::from_secs(60)) + .build() + .unwrap_or_default(); + Some(Self { api_key: key, http }) } /// Scrape a URL via the cloud API. Returns the response JSON. 
@@ -208,10 +210,10 @@ pub async fn smart_fetch( only_main_content: bool, formats: &[&str], ) -> Result<SmartFetchResult, String> { - // Step 1: Try local fetch - let fetch_result = client - .fetch(url) + // Step 1: Try local fetch (with timeout to avoid hanging on slow servers) + let fetch_result = tokio::time::timeout(Duration::from_secs(30), client.fetch(url)) .await + .map_err(|_| format!("Fetch timed out after 30s for {url}"))? .map_err(|e| format!("Fetch failed: {e}"))?; // Step 2: Check for bot protection diff --git a/crates/webclaw-mcp/src/server.rs b/crates/webclaw-mcp/src/server.rs index 788d1dd..e82e94f 100644 --- a/crates/webclaw-mcp/src/server.rs +++ b/crates/webclaw-mcp/src/server.rs @@ -5,13 +5,15 @@ /// to the webclaw cloud API (api.webclaw.io) when bot protection or /// JS rendering is detected. Set WEBCLAW_API_KEY for automatic fallback. use std::sync::Arc; +use std::time::Duration; use rmcp::handler::server::router::tool::ToolRouter; use rmcp::handler::server::wrapper::Parameters; use rmcp::model::{Implementation, ServerCapabilities, ServerInfo}; use rmcp::{ServerHandler, tool, tool_handler, tool_router}; use serde_json::json; -use tracing::{info, warn}; +use tracing::{error, info, warn}; +use url::Url; use crate::cloud::{self, CloudClient, SmartFetchResult}; use crate::tools::*; @@ -32,6 +34,27 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile { } } +/// Validate that a URL is non-empty and has an http or https scheme. +fn validate_url(url: &str) -> Result<(), String> { + if url.is_empty() { + return Err("Invalid URL: must not be empty".into()); + } + match Url::parse(url) { + Ok(parsed) if parsed.scheme() == "http" || parsed.scheme() == "https" => Ok(()), + Ok(parsed) => Err(format!( + "Invalid URL: scheme '{}' not allowed, must start with http:// or https://", + parsed.scheme() + )), + Err(e) => Err(format!("Invalid URL: {e}. 
Must start with http:// or https://")), + } +} + +/// Timeout for local fetch calls (prevents hanging on tarpitting servers). +const LOCAL_FETCH_TIMEOUT: Duration = Duration::from_secs(30); + +/// Maximum poll iterations for research jobs (~10 minutes at 3s intervals). +const RESEARCH_MAX_POLLS: u32 = 200; + #[tool_router] impl WebclawMcp { pub async fn new() -> Self { @@ -46,8 +69,13 @@ impl WebclawMcp { config.proxy_pool = pool; } - let fetch_client = - webclaw_fetch::FetchClient::new(config).expect("failed to build FetchClient"); + let fetch_client = match webclaw_fetch::FetchClient::new(config) { + Ok(client) => client, + Err(e) => { + error!("failed to build FetchClient: {e}"); + std::process::exit(1); + } + }; let chain = webclaw_llm::ProviderChain::default().await; let llm_chain = if chain.is_empty() { @@ -94,6 +122,7 @@ impl WebclawMcp { /// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected. #[tool] async fn scrape(&self, Parameters(params): Parameters) -> Result { + validate_url(¶ms.url)?; let format = params.format.as_deref().unwrap_or("markdown"); let browser = parse_browser(params.browser.as_deref()); let include = params.include_selectors.unwrap_or_default(); @@ -158,6 +187,14 @@ impl WebclawMcp { /// Crawl a website starting from a seed URL, following links breadth-first up to a configurable depth and page limit. #[tool] async fn crawl(&self, Parameters(params): Parameters) -> Result { + validate_url(¶ms.url)?; + + if let Some(max) = params.max_pages { + if max > 500 { + return Err("max_pages cannot exceed 500".into()); + } + } + let format = params.format.as_deref().unwrap_or("markdown"); let config = webclaw_fetch::CrawlConfig { @@ -199,6 +236,7 @@ impl WebclawMcp { /// Discover URLs from a website's sitemaps (robots.txt + sitemap.xml). 
#[tool] async fn map(&self, Parameters(params): Parameters) -> Result { + validate_url(¶ms.url)?; let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, ¶ms.url) .await .map_err(|e| format!("Sitemap discovery failed: {e}"))?; @@ -217,6 +255,12 @@ impl WebclawMcp { if params.urls.is_empty() { return Err("urls must not be empty".into()); } + if params.urls.len() > 100 { + return Err("batch is limited to 100 URLs per request".into()); + } + for u in ¶ms.urls { + validate_url(u)?; + } let format = params.format.as_deref().unwrap_or("markdown"); let concurrency = params.concurrency.unwrap_or(5); @@ -257,6 +301,7 @@ impl WebclawMcp { &self, Parameters(params): Parameters, ) -> Result { + validate_url(¶ms.url)?; let chain = self.llm_chain.as_ref().ok_or( "No LLM providers available. Set OPENAI_API_KEY or ANTHROPIC_API_KEY, or run Ollama locally.", )?; @@ -301,6 +346,7 @@ impl WebclawMcp { &self, Parameters(params): Parameters, ) -> Result { + validate_url(¶ms.url)?; let chain = self.llm_chain.as_ref().ok_or( "No LLM providers available. Set OPENAI_API_KEY or ANTHROPIC_API_KEY, or run Ollama locally.", )?; @@ -326,6 +372,7 @@ impl WebclawMcp { /// Automatically falls back to the webclaw cloud API when bot protection is detected. #[tool] async fn diff(&self, Parameters(params): Parameters) -> Result { + validate_url(¶ms.url)?; let previous: webclaw_core::ExtractionResult = serde_json::from_str(¶ms.previous_snapshot) .map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?; @@ -347,8 +394,47 @@ impl WebclawMcp { Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default()) } SmartFetchResult::Cloud(resp) => { - // Can't do local diff with cloud content, return the cloud response directly - Ok(serde_json::to_string_pretty(&resp).unwrap_or_default()) + // Extract markdown from the cloud response and build a minimal + // ExtractionResult so we can compute the diff locally. 
+ let markdown = resp + .get("markdown") + .and_then(|v| v.as_str()) + .unwrap_or(""); + + if markdown.is_empty() { + return Err( + "Cloud API fallback returned no markdown content; cannot compute diff." + .into(), + ); + } + + let current = webclaw_core::ExtractionResult { + content: webclaw_core::Content { + markdown: markdown.to_string(), + plain_text: markdown.to_string(), + links: Vec::new(), + images: Vec::new(), + code_blocks: Vec::new(), + raw_html: None, + }, + metadata: webclaw_core::Metadata { + title: None, + description: None, + author: None, + published_date: None, + language: None, + url: Some(params.url.clone()), + site_name: None, + image: None, + favicon: None, + word_count: markdown.split_whitespace().count(), + }, + domain_data: None, + structured_data: Vec::new(), + }; + + let content_diff = webclaw_core::diff::diff(&previous, ¤t); + Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default()) } } } @@ -357,11 +443,12 @@ impl WebclawMcp { /// Automatically falls back to the webclaw cloud API when bot protection is detected. #[tool] async fn brand(&self, Parameters(params): Parameters) -> Result { - let fetch_result = self - .fetch_client - .fetch(¶ms.url) - .await - .map_err(|e| format!("Fetch failed: {e}"))?; + validate_url(¶ms.url)?; + let fetch_result = + tokio::time::timeout(LOCAL_FETCH_TIMEOUT, self.fetch_client.fetch(¶ms.url)) + .await + .map_err(|_| format!("Fetch timed out after 30s for {}", params.url))? 
+ .map_err(|e| format!("Fetch failed: {e}"))?; // Check for bot protection before extracting brand if cloud::is_bot_protected(&fetch_result.html, &fetch_result.headers) { @@ -415,9 +502,9 @@ impl WebclawMcp { info!(job_id = %job_id, "research job started, polling for completion"); - // Poll until completed or failed - loop { - tokio::time::sleep(std::time::Duration::from_secs(3)).await; + // Poll until completed or failed, with a max iteration cap (~10 minutes) + for poll in 0..RESEARCH_MAX_POLLS { + tokio::time::sleep(Duration::from_secs(3)).await; let status_resp = cloud.get(&format!("research/{job_id}")).await?; let status = status_resp @@ -445,10 +532,17 @@ impl WebclawMcp { return Err(format!("Research job failed: {error}")); } _ => { - // Still processing, continue polling + if poll % 20 == 19 { + info!(job_id = %job_id, poll, "research still in progress..."); + } } } } + + Err(format!( + "Research job {job_id} timed out after ~10 minutes of polling. \ + Check status manually via the webclaw API: GET /v1/research/{job_id}" + )) } /// Search the web for a query and return structured results. Requires WEBCLAW_API_KEY. @@ -498,7 +592,7 @@ impl WebclawMcp { impl ServerHandler for WebclawMcp { fn get_info(&self) -> ServerInfo { ServerInfo::new(ServerCapabilities::builder().enable_tools().build()) - .with_server_info(Implementation::from_build_env()) + .with_server_info(Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION"))) .with_instructions(String::from( "Webclaw MCP server -- web content extraction for AI agents. \ Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.",