fix: v0.1.1 — MCP identity, timeouts, exit codes, URL validation

Critical:
- MCP server identifies as "webclaw-mcp" instead of "rmcp"
- Research tool poll loop capped at 200 iterations (~10 min)

CLI:
- Non-zero exit codes on errors
- Text format strips markdown table syntax

MCP server:
- URL validation on all tools
- 60s cloud API timeout, 30s local fetch timeout
- Diff cloud fallback computes actual diff
- Batch capped at 100 URLs, crawl at 500 pages
- Graceful startup failure instead of panic

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Valerio
Date: 2026-03-24 17:25:05 +01:00
parent 09fa3f5fc9
commit ea9c783bc5
8 changed files with 194 additions and 34 deletions

CHANGELOG.md

@@ -3,6 +3,24 @@
 All notable changes to webclaw are documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/).
+
+## [0.1.1] — 2026-03-24
+### Fixed
+- MCP server now identifies as `webclaw-mcp` instead of `rmcp` in the MCP handshake
+- Research tool polling caps at 200 iterations (~10 min) instead of looping forever
+- CLI returns non-zero exit codes on errors (invalid format, fetch failures, missing LLM)
+- Text format output strips markdown table syntax (`| --- |` pipes)
+- All MCP tools validate URLs before network calls with clear error messages
+- Cloud API HTTP client has a 60s timeout instead of no timeout
+- Local fetch calls time out after 30s to prevent hanging on slow servers
+- Diff cloud fallback computes an actual diff instead of returning raw scrape JSON
+- FetchClient startup failure logs and exits gracefully instead of panicking
+
+### Added
+- Upper bounds: batch capped at 100 URLs, crawl capped at 500 pages
+
+---
+
 ## [0.1.0] — 2026-03-18
 First public release. Full-featured web content extraction toolkit for LLMs.

Cargo.lock (generated)

@@ -2854,7 +2854,7 @@ dependencies = [

 [[package]]
 name = "webclaw-cli"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "clap",
  "dotenvy",
@@ -2874,7 +2874,7 @@ dependencies = [

 [[package]]
 name = "webclaw-core"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "ego-tree",
  "once_cell",
@@ -2891,7 +2891,7 @@ dependencies = [

 [[package]]
 name = "webclaw-fetch"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "primp",
  "quick-xml",
@@ -2909,7 +2909,7 @@ dependencies = [

 [[package]]
 name = "webclaw-llm"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "async-trait",
  "reqwest",
@@ -2922,7 +2922,7 @@ dependencies = [

 [[package]]
 name = "webclaw-mcp"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "dotenvy",
  "reqwest",
@@ -2933,6 +2933,7 @@ dependencies = [
  "tokio",
  "tracing",
  "tracing-subscriber",
+ "url",
  "webclaw-core",
  "webclaw-fetch",
  "webclaw-llm",
@@ -2941,7 +2942,7 @@ dependencies = [

 [[package]]
 name = "webclaw-pdf"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "pdf-extract",
  "thiserror",

Cargo.toml

@@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/*"]

 [workspace.package]
-version = "0.1.0"
+version = "0.1.1"
 edition = "2024"
 license = "MIT"
 repository = "https://github.com/0xMassi/webclaw"

webclaw-cli (command handlers)

@@ -961,7 +961,11 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> {
         result.total, result.ok, result.errors, result.elapsed_secs,
     );
-    Ok(())
+    if result.errors > 0 {
+        Err(format!("{} of {} pages failed", result.errors, result.total))
+    } else {
+        Ok(())
+    }
 }

 async fn run_map(cli: &Cli) -> Result<(), String> {
@@ -1023,7 +1027,11 @@ async fn run_batch(cli: &Cli, urls: &[String]) -> Result<(), String> {
         errors
     );
-    Ok(())
+    if errors > 0 {
+        Err(format!("{errors} of {} URLs failed", results.len()))
+    } else {
+        Ok(())
+    }
 }

 async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> {
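Note that these `Err` returns only become a non-zero exit code if `main` converts them into one. A minimal sketch of that wiring, assuming the usual tokio CLI pattern (the actual `main` in webclaw-cli may be wired differently):

```rust
// Hypothetical wiring, not the actual webclaw-cli main: shows how a
// Result<(), String> from run_crawl / run_batch becomes exit code 1.
#[tokio::main]
async fn main() {
    if let Err(msg) = run().await {
        eprintln!("error: {msg}");
        std::process::exit(1); // lets shell scripts and CI detect failure
    }
}

async fn run() -> Result<(), String> {
    // ...dispatch to run_crawl / run_batch / run_diff here...
    Ok(())
}
```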

webclaw-core (markdown/plain-text converter)

@@ -786,6 +786,9 @@ fn strip_markdown(md: &str) -> String {
     static ITALIC_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\*([^*]+)\*").unwrap());
     static CODE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"`([^`]+)`").unwrap());
     static HEADING_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?m)^#{1,6}\s+").unwrap());
+    // Table separator rows: | --- | --- | (with optional colons for alignment)
+    static TABLE_SEP_RE: Lazy<Regex> =
+        Lazy::new(|| Regex::new(r"^\|\s*:?-{2,}:?\s*(\|\s*:?-{2,}:?\s*)*\|$").unwrap());

     let s = IMG_RE.replace_all(md, "$1");
     let s = LINK_RE.replace_all(&s, "$1");
@@ -794,15 +797,31 @@ fn strip_markdown(md: &str) -> String {
     let s = CODE_RE.replace_all(&s, "$1");
     let s = HEADING_RE.replace_all(&s, "");

-    // Remove fenced code block markers
-    let mut lines: Vec<&str> = Vec::new();
+    // Remove fenced code block markers + strip table syntax
+    let mut lines: Vec<String> = Vec::new();
     let mut in_fence = false;
     for line in s.lines() {
         if line.trim_start().starts_with("```") {
             in_fence = !in_fence;
             continue;
         }
-        lines.push(line);
+
+        let trimmed = line.trim();
+        // Skip table separator rows (| --- | --- |)
+        if TABLE_SEP_RE.is_match(trimmed) {
+            continue;
+        }
+        // Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs
+        if trimmed.starts_with('|') && trimmed.ends_with('|') {
+            let inner = &trimmed[1..trimmed.len() - 1];
+            let cells: Vec<&str> = inner.split('|').map(|c| c.trim()).collect();
+            lines.push(cells.join("\t"));
+            continue;
+        }
+
+        lines.push(line.to_string());
     }

     lines.join("\n")
@@ -993,6 +1012,23 @@ mod tests {
         assert!(!plain.contains("["));
     }

+    #[test]
+    fn strips_table_syntax_from_plain_text() {
+        let html = r##"
+        <table>
+          <thead><tr><th>Name</th><th>Age</th></tr></thead>
+          <tbody><tr><td>Alice</td><td>30</td></tr></tbody>
+        </table>"##;
+        let (md, plain, _) = convert_html(html, None);
+        // Markdown should have table syntax
+        assert!(md.contains("| --- |"));
+        // Plain text should NOT have any pipe or separator syntax
+        assert!(!plain.contains("| --- |"), "separator row leaked: {plain}");
+        assert!(!plain.contains("| Name"), "pipe syntax leaked: {plain}");
+        assert!(plain.contains("Name"), "table content missing: {plain}");
+        assert!(plain.contains("Alice"), "table content missing: {plain}");
+    }
+
     #[test]
     fn nested_list() {
         let html = r##"

webclaw-mcp (Cargo.toml)

@@ -23,3 +23,4 @@ tokio = { workspace = true }
 tracing = { workspace = true }
 tracing-subscriber = { workspace = true }
 reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
+url = "2"

webclaw-mcp (cloud fallback module)

@@ -3,6 +3,7 @@
 /// When local fetch returns a challenge page, this module retries
 /// via api.webclaw.io. Requires WEBCLAW_API_KEY to be set.
 use std::collections::HashMap;
+use std::time::Duration;

 use serde_json::{Value, json};
 use tracing::info;
@@ -23,10 +24,11 @@ impl CloudClient {
         if key.is_empty() {
             return None;
         }
-        Some(Self {
-            api_key: key,
-            http: reqwest::Client::new(),
-        })
+        let http = reqwest::Client::builder()
+            .timeout(Duration::from_secs(60))
+            .build()
+            .unwrap_or_default();
+        Some(Self { api_key: key, http })
     }

     /// Scrape a URL via the cloud API. Returns the response JSON.
@@ -208,10 +210,10 @@ pub async fn smart_fetch(
     only_main_content: bool,
     formats: &[&str],
 ) -> Result<SmartFetchResult, String> {
-    // Step 1: Try local fetch
-    let fetch_result = client
-        .fetch(url)
-        .await
+    // Step 1: Try local fetch (with timeout to avoid hanging on slow servers)
+    let fetch_result = tokio::time::timeout(Duration::from_secs(30), client.fetch(url))
+        .await
+        .map_err(|_| format!("Fetch timed out after 30s for {url}"))?
        .map_err(|e| format!("Fetch failed: {e}"))?;

     // Step 2: Check for bot protection
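One subtlety in the hunk above: `tokio::time::timeout` wraps the inner future's `Result` in a second `Result`, which is why two `map_err` calls are chained. A self-contained sketch of the shape, with a stand-in `fetch`:

```rust
use std::time::Duration;

// Stand-in for the real FetchClient::fetch; only the types matter here.
async fn fetch(url: &str) -> Result<String, String> {
    Ok(format!("<html>{url}</html>"))
}

async fn fetch_with_timeout(url: &str) -> Result<String, String> {
    tokio::time::timeout(Duration::from_secs(30), fetch(url))
        .await
        .map_err(|_| format!("Fetch timed out after 30s for {url}"))? // outer: Elapsed
        .map_err(|e| format!("Fetch failed: {e}")) // inner: fetch error
}

#[tokio::main]
async fn main() {
    println!("{:?}", fetch_with_timeout("https://example.com").await);
}
```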

webclaw-mcp (MCP server)

@@ -5,13 +5,15 @@
 /// to the webclaw cloud API (api.webclaw.io) when bot protection or
 /// JS rendering is detected. Set WEBCLAW_API_KEY for automatic fallback.
 use std::sync::Arc;
+use std::time::Duration;

 use rmcp::handler::server::router::tool::ToolRouter;
 use rmcp::handler::server::wrapper::Parameters;
 use rmcp::model::{Implementation, ServerCapabilities, ServerInfo};
 use rmcp::{ServerHandler, tool, tool_handler, tool_router};
 use serde_json::json;
-use tracing::{info, warn};
+use tracing::{error, info, warn};
+use url::Url;

 use crate::cloud::{self, CloudClient, SmartFetchResult};
 use crate::tools::*;
@@ -32,6 +34,27 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
     }
 }

+/// Validate that a URL is non-empty and has an http or https scheme.
+fn validate_url(url: &str) -> Result<(), String> {
+    if url.is_empty() {
+        return Err("Invalid URL: must not be empty".into());
+    }
+    match Url::parse(url) {
+        Ok(parsed) if parsed.scheme() == "http" || parsed.scheme() == "https" => Ok(()),
+        Ok(parsed) => Err(format!(
+            "Invalid URL: scheme '{}' not allowed, must start with http:// or https://",
+            parsed.scheme()
+        )),
+        Err(e) => Err(format!("Invalid URL: {e}. Must start with http:// or https://")),
+    }
+}
+
+/// Timeout for local fetch calls (prevents hanging on tarpitting servers).
+const LOCAL_FETCH_TIMEOUT: Duration = Duration::from_secs(30);
+
+/// Maximum poll iterations for research jobs (~10 minutes at 3s intervals).
+const RESEARCH_MAX_POLLS: u32 = 200;
+
 #[tool_router]
 impl WebclawMcp {
     pub async fn new() -> Self {
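A few illustrative cases for the new helper (a sketch of expected behavior, not part of the commit's test suite):

```rust
// Usage sketch: expected behavior of the validate_url helper above.
fn demo() -> Result<(), String> {
    validate_url("https://example.com")?;                 // ok
    validate_url("http://example.com/page?q=1")?;         // ok
    assert!(validate_url("").is_err());                   // empty input
    assert!(validate_url("ftp://example.com").is_err());  // disallowed scheme
    assert!(validate_url("example.com").is_err());        // no scheme: Url::parse fails
    Ok(())
}
```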
@@ -46,8 +69,13 @@ impl WebclawMcp {
             config.proxy_pool = pool;
         }

-        let fetch_client =
-            webclaw_fetch::FetchClient::new(config).expect("failed to build FetchClient");
+        let fetch_client = match webclaw_fetch::FetchClient::new(config) {
+            Ok(client) => client,
+            Err(e) => {
+                error!("failed to build FetchClient: {e}");
+                std::process::exit(1);
+            }
+        };

         let chain = webclaw_llm::ProviderChain::default().await;
         let llm_chain = if chain.is_empty() {
@@ -94,6 +122,7 @@ impl WebclawMcp {
     /// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected.
     #[tool]
     async fn scrape(&self, Parameters(params): Parameters<ScrapeParams>) -> Result<String, String> {
+        validate_url(&params.url)?;
         let format = params.format.as_deref().unwrap_or("markdown");
         let browser = parse_browser(params.browser.as_deref());
         let include = params.include_selectors.unwrap_or_default();
@@ -158,6 +187,14 @@ impl WebclawMcp {
     /// Crawl a website starting from a seed URL, following links breadth-first up to a configurable depth and page limit.
     #[tool]
     async fn crawl(&self, Parameters(params): Parameters<CrawlParams>) -> Result<String, String> {
+        validate_url(&params.url)?;
+        if let Some(max) = params.max_pages {
+            if max > 500 {
+                return Err("max_pages cannot exceed 500".into());
+            }
+        }
+
         let format = params.format.as_deref().unwrap_or("markdown");

         let config = webclaw_fetch::CrawlConfig {
@@ -199,6 +236,7 @@ impl WebclawMcp {
     /// Discover URLs from a website's sitemaps (robots.txt + sitemap.xml).
     #[tool]
     async fn map(&self, Parameters(params): Parameters<MapParams>) -> Result<String, String> {
+        validate_url(&params.url)?;
         let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, &params.url)
             .await
             .map_err(|e| format!("Sitemap discovery failed: {e}"))?;
@@ -217,6 +255,12 @@ impl WebclawMcp {
         if params.urls.is_empty() {
             return Err("urls must not be empty".into());
         }
+        if params.urls.len() > 100 {
+            return Err("batch is limited to 100 URLs per request".into());
+        }
+        for u in &params.urls {
+            validate_url(u)?;
+        }

         let format = params.format.as_deref().unwrap_or("markdown");
         let concurrency = params.concurrency.unwrap_or(5);
@@ -257,6 +301,7 @@ impl WebclawMcp {
         &self,
         Parameters(params): Parameters<ExtractParams>,
     ) -> Result<String, String> {
+        validate_url(&params.url)?;
         let chain = self.llm_chain.as_ref().ok_or(
             "No LLM providers available. Set OPENAI_API_KEY or ANTHROPIC_API_KEY, or run Ollama locally.",
         )?;
@@ -301,6 +346,7 @@ impl WebclawMcp {
         &self,
         Parameters(params): Parameters<SummarizeParams>,
     ) -> Result<String, String> {
+        validate_url(&params.url)?;
         let chain = self.llm_chain.as_ref().ok_or(
             "No LLM providers available. Set OPENAI_API_KEY or ANTHROPIC_API_KEY, or run Ollama locally.",
         )?;
@@ -326,6 +372,7 @@ impl WebclawMcp {
     /// Automatically falls back to the webclaw cloud API when bot protection is detected.
     #[tool]
     async fn diff(&self, Parameters(params): Parameters<DiffParams>) -> Result<String, String> {
+        validate_url(&params.url)?;
         let previous: webclaw_core::ExtractionResult =
             serde_json::from_str(&params.previous_snapshot)
                 .map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
@@ -347,8 +394,47 @@ impl WebclawMcp {
                 Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default())
             }
             SmartFetchResult::Cloud(resp) => {
-                // Can't do local diff with cloud content, return the cloud response directly
-                Ok(serde_json::to_string_pretty(&resp).unwrap_or_default())
+                // Extract markdown from the cloud response and build a minimal
+                // ExtractionResult so we can compute the diff locally.
+                let markdown = resp
+                    .get("markdown")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or("");
+                if markdown.is_empty() {
+                    return Err(
+                        "Cloud API fallback returned no markdown content; cannot compute diff."
+                            .into(),
+                    );
+                }
+                let current = webclaw_core::ExtractionResult {
+                    content: webclaw_core::Content {
+                        markdown: markdown.to_string(),
+                        plain_text: markdown.to_string(),
+                        links: Vec::new(),
+                        images: Vec::new(),
+                        code_blocks: Vec::new(),
+                        raw_html: None,
+                    },
+                    metadata: webclaw_core::Metadata {
+                        title: None,
+                        description: None,
+                        author: None,
+                        published_date: None,
+                        language: None,
+                        url: Some(params.url.clone()),
+                        site_name: None,
+                        image: None,
+                        favicon: None,
+                        word_count: markdown.split_whitespace().count(),
+                    },
+                    domain_data: None,
+                    structured_data: Vec::new(),
+                };
+                let content_diff = webclaw_core::diff::diff(&previous, &current);
+                Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default())
             }
         }
     }
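The `.get("markdown").and_then(|v| v.as_str())` chain above is the standard way to pull an optional string field out of a `serde_json::Value` without panicking; a small self-contained illustration:

```rust
use serde_json::json;

fn main() {
    let resp = json!({ "markdown": "# Title", "status": 200 });

    // Present and a string: Some("# Title") -> "# Title"
    let md = resp.get("markdown").and_then(|v| v.as_str()).unwrap_or("");
    assert_eq!(md, "# Title");

    // Absent (or non-string) fields fall back to "" instead of panicking,
    // which is what triggers the "no markdown content" error path above.
    let missing = resp.get("html").and_then(|v| v.as_str()).unwrap_or("");
    assert_eq!(missing, "");
}
```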
@@ -357,11 +443,12 @@ impl WebclawMcp {
     /// Automatically falls back to the webclaw cloud API when bot protection is detected.
     #[tool]
     async fn brand(&self, Parameters(params): Parameters<BrandParams>) -> Result<String, String> {
-        let fetch_result = self
-            .fetch_client
-            .fetch(&params.url)
-            .await
-            .map_err(|e| format!("Fetch failed: {e}"))?;
+        validate_url(&params.url)?;
+        let fetch_result =
+            tokio::time::timeout(LOCAL_FETCH_TIMEOUT, self.fetch_client.fetch(&params.url))
+                .await
+                .map_err(|_| format!("Fetch timed out after 30s for {}", params.url))?
+                .map_err(|e| format!("Fetch failed: {e}"))?;

         // Check for bot protection before extracting brand
         if cloud::is_bot_protected(&fetch_result.html, &fetch_result.headers) {
@@ -415,9 +502,9 @@ impl WebclawMcp {
         info!(job_id = %job_id, "research job started, polling for completion");

-        // Poll until completed or failed
-        loop {
-            tokio::time::sleep(std::time::Duration::from_secs(3)).await;
+        // Poll until completed or failed, with a max iteration cap (~10 minutes)
+        for poll in 0..RESEARCH_MAX_POLLS {
+            tokio::time::sleep(Duration::from_secs(3)).await;

             let status_resp = cloud.get(&format!("research/{job_id}")).await?;
             let status = status_resp
@@ -445,10 +532,17 @@ impl WebclawMcp {
                     return Err(format!("Research job failed: {error}"));
                 }
                 _ => {
-                    // Still processing, continue polling
+                    if poll % 20 == 19 {
+                        info!(job_id = %job_id, poll, "research still in progress...");
+                    }
                 }
             }
         }

+        Err(format!(
+            "Research job {job_id} timed out after ~10 minutes of polling. \
+             Check status manually via the webclaw API: GET /v1/research/{job_id}"
+        ))
     }

     /// Search the web for a query and return structured results. Requires WEBCLAW_API_KEY.
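The bounded loop above follows a generic pattern: poll at a fixed interval, log progress occasionally, and fall through to a timeout error when the cap is hit. A stripped-down sketch of the same shape (hypothetical `check` closure, not webclaw code):

```rust
use std::time::Duration;

const MAX_POLLS: u32 = 200; // ~10 minutes at 3s intervals

async fn poll_until_done(mut check: impl FnMut() -> Option<String>) -> Result<String, String> {
    for poll in 0..MAX_POLLS {
        tokio::time::sleep(Duration::from_secs(3)).await;
        if let Some(result) = check() {
            return Ok(result);
        }
        if poll % 20 == 19 {
            // Log every 20th poll (~1 minute) so long jobs stay visible.
            eprintln!("still in progress (poll {poll})");
        }
    }
    Err("timed out after ~10 minutes of polling".into())
}
```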
@@ -498,7 +592,7 @@ impl ServerHandler for WebclawMcp {
     fn get_info(&self) -> ServerInfo {
         ServerInfo::new(ServerCapabilities::builder().enable_tools().build())
-            .with_server_info(Implementation::from_build_env())
+            .with_server_info(Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION")))
             .with_instructions(String::from(
                 "Webclaw MCP server -- web content extraction for AI agents. \
                 Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.",
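With this change, the serverInfo reported during the MCP initialize handshake carries the crate's own name and version instead of the rmcp library's build identity. Assuming rmcp's `Implementation` exposes public `name`/`version` fields (it mirrors the MCP spec's Implementation object), the effect is roughly:

```rust
// Sketch: what the identity fix changes in the handshake payload.
let info = Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION"));
assert_eq!(info.name, "webclaw-mcp"); // clients now see "webclaw-mcp", not "rmcp"
```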