From ea9c783bc515be185ced912ee3913dd27dbf5a06 Mon Sep 17 00:00:00 2001 From: Valerio Date: Tue, 24 Mar 2026 17:25:05 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20v0.1.1=20=E2=80=94=20MCP=20identity,=20t?= =?UTF-8?q?imeouts,=20exit=20codes,=20URL=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical: - MCP server identifies as "webclaw-mcp" instead of "rmcp" - Research tool poll loop capped at 200 iterations (~10 min) CLI: - Non-zero exit codes on errors - Text format strips markdown table syntax MCP server: - URL validation on all tools - 60s cloud API timeout, 30s local fetch timeout - Diff cloud fallback computes actual diff - Batch capped at 100 URLs, crawl at 500 pages - Graceful startup failure instead of panic Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 18 ++++ Cargo.lock | 13 +-- Cargo.toml | 2 +- crates/webclaw-cli/src/main.rs | 12 ++- crates/webclaw-core/src/markdown.rs | 42 +++++++++- crates/webclaw-mcp/Cargo.toml | 1 + crates/webclaw-mcp/src/cloud.rs | 16 ++-- crates/webclaw-mcp/src/server.rs | 124 ++++++++++++++++++++++++---- 8 files changed, 194 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f4da1d8..baba179 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,24 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). 
+## [0.1.1] — 2026-03-24 + +### Fixed +- MCP server now identifies as `webclaw-mcp` instead of `rmcp` in the MCP handshake +- Research tool polling caps at 200 iterations (~10 min) instead of looping forever +- CLI returns non-zero exit codes on errors (invalid format, fetch failures, missing LLM) +- Text format output strips markdown table syntax (`| --- |` pipes) +- All MCP tools validate URLs before network calls with clear error messages +- Cloud API HTTP client has 60s timeout instead of no timeout +- Local fetch calls timeout after 30s to prevent hanging on slow servers +- Diff cloud fallback computes actual diff instead of returning raw scrape JSON +- FetchClient startup failure logs and exits gracefully instead of panicking + +### Added +- Upper bounds: batch capped at 100 URLs, crawl capped at 500 pages + +--- + ## [0.1.0] — 2026-03-18 First public release. Full-featured web content extraction toolkit for LLMs. diff --git a/Cargo.lock b/Cargo.lock index 19478f8..b4b4ed8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2854,7 +2854,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.1.0" +version = "0.1.1" dependencies = [ "clap", "dotenvy", @@ -2874,7 +2874,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.1.0" +version = "0.1.1" dependencies = [ "ego-tree", "once_cell", @@ -2891,7 +2891,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.1.0" +version = "0.1.1" dependencies = [ "primp", "quick-xml", @@ -2909,7 +2909,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.1.0" +version = "0.1.1" dependencies = [ "async-trait", "reqwest", @@ -2922,7 +2922,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.1.0" +version = "0.1.1" dependencies = [ "dotenvy", "reqwest", @@ -2933,6 +2933,7 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", + "url", "webclaw-core", "webclaw-fetch", "webclaw-llm", @@ -2941,7 +2942,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" 
-version = "0.1.0" +version = "0.1.1" dependencies = [ "pdf-extract", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index c87a9d7..76ab6a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.1.0" +version = "0.1.1" edition = "2024" license = "MIT" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 8d9fb32..af5b0af 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -961,7 +961,11 @@ async fn run_crawl(cli: &Cli) -> Result<(), String> { result.total, result.ok, result.errors, result.elapsed_secs, ); - Ok(()) + if result.errors > 0 { + Err(format!("{} of {} pages failed", result.errors, result.total)) + } else { + Ok(()) + } } async fn run_map(cli: &Cli) -> Result<(), String> { @@ -1023,7 +1027,11 @@ async fn run_batch(cli: &Cli, urls: &[String]) -> Result<(), String> { errors ); - Ok(()) + if errors > 0 { + Err(format!("{errors} of {} URLs failed", results.len())) + } else { + Ok(()) + } } async fn run_diff(cli: &Cli, snapshot_path: &str) -> Result<(), String> { diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs index fcfda0e..91f2c6e 100644 --- a/crates/webclaw-core/src/markdown.rs +++ b/crates/webclaw-core/src/markdown.rs @@ -786,6 +786,9 @@ fn strip_markdown(md: &str) -> String { static ITALIC_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\*([^*]+)\*").unwrap()); static CODE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"`([^`]+)`").unwrap()); static HEADING_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?m)^#{1,6}\s+").unwrap()); + // Table separator rows: | --- | --- | (with optional colons for alignment) + static TABLE_SEP_RE: Lazy<Regex> = + Lazy::new(|| Regex::new(r"^\|\s*:?-{2,}:?\s*(\|\s*:?-{2,}:?\s*)*\|$").unwrap()); let s = IMG_RE.replace_all(md, "$1"); let s = LINK_RE.replace_all(&s, "$1"); @@ -794,15 +797,31 @@ fn strip_markdown(md: &str) -> String { let s =
CODE_RE.replace_all(&s, "$1"); let s = HEADING_RE.replace_all(&s, ""); - // Remove fenced code block markers - let mut lines: Vec<&str> = Vec::new(); + // Remove fenced code block markers + strip table syntax + let mut lines: Vec<String> = Vec::new(); let mut in_fence = false; for line in s.lines() { if line.trim_start().starts_with("```") { in_fence = !in_fence; continue; } - lines.push(line); + + let trimmed = line.trim(); + + // Skip table separator rows (| --- | --- |) + if TABLE_SEP_RE.is_match(trimmed) { + continue; + } + + // Convert table data rows: strip leading/trailing pipes, replace inner pipes with tabs + if trimmed.starts_with('|') && trimmed.ends_with('|') { + let inner = &trimmed[1..trimmed.len() - 1]; + let cells: Vec<&str> = inner.split('|').map(|c| c.trim()).collect(); + lines.push(cells.join("\t")); + continue; + } + + lines.push(line.to_string()); } lines.join("\n") @@ -993,6 +1012,23 @@ mod tests { assert!(!plain.contains("[")); } + #[test] + fn strips_table_syntax_from_plain_text() { + let html = r##" + <table> + <tr><th>Name</th><th>Age</th></tr> + <tr><td>Alice</td><td>30</td></tr> + </table> + "##; + let (md, plain, _) = convert_html(html, None); + // Markdown should have table syntax + assert!(md.contains("| --- |")); + // Plain text should NOT have any pipe or separator syntax + assert!(!plain.contains("| --- |"), "separator row leaked: {plain}"); + assert!(!plain.contains("| Name"), "pipe syntax leaked: {plain}"); + assert!(plain.contains("Name"), "table content missing: {plain}"); + assert!(plain.contains("Alice"), "table content missing: {plain}"); + } + #[test] fn nested_list() { let html = r##" diff --git a/crates/webclaw-mcp/Cargo.toml b/crates/webclaw-mcp/Cargo.toml index 96e4776..acebfc7 100644 --- a/crates/webclaw-mcp/Cargo.toml +++ b/crates/webclaw-mcp/Cargo.toml @@ -23,3 +23,4 @@ tokio = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } +url = "2" diff --git a/crates/webclaw-mcp/src/cloud.rs b/crates/webclaw-mcp/src/cloud.rs index 565a172..bf05ffe 100644 --- a/crates/webclaw-mcp/src/cloud.rs +++ b/crates/webclaw-mcp/src/cloud.rs @@ -3,6 +3,7 @@ /// When local fetch returns a challenge page, this module retries /// via api.webclaw.io. Requires WEBCLAW_API_KEY to be set. use std::collections::HashMap; +use std::time::Duration; use serde_json::{Value, json}; use tracing::info; @@ -23,10 +24,11 @@ impl CloudClient { if key.is_empty() { return None; } - Some(Self { - api_key: key, - http: reqwest::Client::new(), - }) + let http = reqwest::Client::builder() + .timeout(Duration::from_secs(60)) + .build() + .unwrap_or_default(); + Some(Self { api_key: key, http }) } /// Scrape a URL via the cloud API. Returns the response JSON. 
@@ -208,10 +210,10 @@ pub async fn smart_fetch( only_main_content: bool, formats: &[&str], ) -> Result<SmartFetchResult, String> { - // Step 1: Try local fetch - let fetch_result = client - .fetch(url) + // Step 1: Try local fetch (with timeout to avoid hanging on slow servers) + let fetch_result = tokio::time::timeout(Duration::from_secs(30), client.fetch(url)) .await + .map_err(|_| format!("Fetch timed out after 30s for {url}"))? .map_err(|e| format!("Fetch failed: {e}"))?; // Step 2: Check for bot protection diff --git a/crates/webclaw-mcp/src/server.rs b/crates/webclaw-mcp/src/server.rs index 788d1dd..e82e94f 100644 --- a/crates/webclaw-mcp/src/server.rs +++ b/crates/webclaw-mcp/src/server.rs @@ -5,13 +5,15 @@ /// to the webclaw cloud API (api.webclaw.io) when bot protection or /// JS rendering is detected. Set WEBCLAW_API_KEY for automatic fallback. use std::sync::Arc; +use std::time::Duration; use rmcp::handler::server::router::tool::ToolRouter; use rmcp::handler::server::wrapper::Parameters; use rmcp::model::{Implementation, ServerCapabilities, ServerInfo}; use rmcp::{ServerHandler, tool, tool_handler, tool_router}; use serde_json::json; -use tracing::{info, warn}; +use tracing::{error, info, warn}; +use url::Url; use crate::cloud::{self, CloudClient, SmartFetchResult}; use crate::tools::*; @@ -32,6 +34,27 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile { } } +/// Validate that a URL is non-empty and has an http or https scheme. +fn validate_url(url: &str) -> Result<(), String> { + if url.is_empty() { + return Err("Invalid URL: must not be empty".into()); + } + match Url::parse(url) { + Ok(parsed) if parsed.scheme() == "http" || parsed.scheme() == "https" => Ok(()), + Ok(parsed) => Err(format!( + "Invalid URL: scheme '{}' not allowed, must start with http:// or https://", + parsed.scheme() + )), + Err(e) => Err(format!("Invalid URL: {e}. 
Must start with http:// or https://")), + } +} + +/// Timeout for local fetch calls (prevents hanging on tarpitting servers). +const LOCAL_FETCH_TIMEOUT: Duration = Duration::from_secs(30); + +/// Maximum poll iterations for research jobs (~10 minutes at 3s intervals). +const RESEARCH_MAX_POLLS: u32 = 200; + #[tool_router] impl WebclawMcp { pub async fn new() -> Self { @@ -46,8 +69,13 @@ impl WebclawMcp { config.proxy_pool = pool; } - let fetch_client = - webclaw_fetch::FetchClient::new(config).expect("failed to build FetchClient"); + let fetch_client = match webclaw_fetch::FetchClient::new(config) { + Ok(client) => client, + Err(e) => { + error!("failed to build FetchClient: {e}"); + std::process::exit(1); + } + }; let chain = webclaw_llm::ProviderChain::default().await; let llm_chain = if chain.is_empty() { @@ -94,6 +122,7 @@ impl WebclawMcp { /// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected. #[tool] async fn scrape(&self, Parameters(params): Parameters) -> Result { + validate_url(¶ms.url)?; let format = params.format.as_deref().unwrap_or("markdown"); let browser = parse_browser(params.browser.as_deref()); let include = params.include_selectors.unwrap_or_default(); @@ -158,6 +187,14 @@ impl WebclawMcp { /// Crawl a website starting from a seed URL, following links breadth-first up to a configurable depth and page limit. #[tool] async fn crawl(&self, Parameters(params): Parameters) -> Result { + validate_url(¶ms.url)?; + + if let Some(max) = params.max_pages { + if max > 500 { + return Err("max_pages cannot exceed 500".into()); + } + } + let format = params.format.as_deref().unwrap_or("markdown"); let config = webclaw_fetch::CrawlConfig { @@ -199,6 +236,7 @@ impl WebclawMcp { /// Discover URLs from a website's sitemaps (robots.txt + sitemap.xml). 
#[tool] async fn map(&self, Parameters(params): Parameters) -> Result { + validate_url(¶ms.url)?; let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, ¶ms.url) .await .map_err(|e| format!("Sitemap discovery failed: {e}"))?; @@ -217,6 +255,12 @@ impl WebclawMcp { if params.urls.is_empty() { return Err("urls must not be empty".into()); } + if params.urls.len() > 100 { + return Err("batch is limited to 100 URLs per request".into()); + } + for u in ¶ms.urls { + validate_url(u)?; + } let format = params.format.as_deref().unwrap_or("markdown"); let concurrency = params.concurrency.unwrap_or(5); @@ -257,6 +301,7 @@ impl WebclawMcp { &self, Parameters(params): Parameters, ) -> Result { + validate_url(¶ms.url)?; let chain = self.llm_chain.as_ref().ok_or( "No LLM providers available. Set OPENAI_API_KEY or ANTHROPIC_API_KEY, or run Ollama locally.", )?; @@ -301,6 +346,7 @@ impl WebclawMcp { &self, Parameters(params): Parameters, ) -> Result { + validate_url(¶ms.url)?; let chain = self.llm_chain.as_ref().ok_or( "No LLM providers available. Set OPENAI_API_KEY or ANTHROPIC_API_KEY, or run Ollama locally.", )?; @@ -326,6 +372,7 @@ impl WebclawMcp { /// Automatically falls back to the webclaw cloud API when bot protection is detected. #[tool] async fn diff(&self, Parameters(params): Parameters) -> Result { + validate_url(¶ms.url)?; let previous: webclaw_core::ExtractionResult = serde_json::from_str(¶ms.previous_snapshot) .map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?; @@ -347,8 +394,47 @@ impl WebclawMcp { Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default()) } SmartFetchResult::Cloud(resp) => { - // Can't do local diff with cloud content, return the cloud response directly - Ok(serde_json::to_string_pretty(&resp).unwrap_or_default()) + // Extract markdown from the cloud response and build a minimal + // ExtractionResult so we can compute the diff locally. 
+ let markdown = resp + .get("markdown") + .and_then(|v| v.as_str()) + .unwrap_or(""); + + if markdown.is_empty() { + return Err( + "Cloud API fallback returned no markdown content; cannot compute diff." + .into(), + ); + } + + let current = webclaw_core::ExtractionResult { + content: webclaw_core::Content { + markdown: markdown.to_string(), + plain_text: markdown.to_string(), + links: Vec::new(), + images: Vec::new(), + code_blocks: Vec::new(), + raw_html: None, + }, + metadata: webclaw_core::Metadata { + title: None, + description: None, + author: None, + published_date: None, + language: None, + url: Some(params.url.clone()), + site_name: None, + image: None, + favicon: None, + word_count: markdown.split_whitespace().count(), + }, + domain_data: None, + structured_data: Vec::new(), + }; + + let content_diff = webclaw_core::diff::diff(&previous, ¤t); + Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default()) } } } @@ -357,11 +443,12 @@ impl WebclawMcp { /// Automatically falls back to the webclaw cloud API when bot protection is detected. #[tool] async fn brand(&self, Parameters(params): Parameters) -> Result { - let fetch_result = self - .fetch_client - .fetch(¶ms.url) - .await - .map_err(|e| format!("Fetch failed: {e}"))?; + validate_url(¶ms.url)?; + let fetch_result = + tokio::time::timeout(LOCAL_FETCH_TIMEOUT, self.fetch_client.fetch(¶ms.url)) + .await + .map_err(|_| format!("Fetch timed out after 30s for {}", params.url))? 
+ .map_err(|e| format!("Fetch failed: {e}"))?; // Check for bot protection before extracting brand if cloud::is_bot_protected(&fetch_result.html, &fetch_result.headers) { @@ -415,9 +502,9 @@ impl WebclawMcp { info!(job_id = %job_id, "research job started, polling for completion"); - // Poll until completed or failed - loop { - tokio::time::sleep(std::time::Duration::from_secs(3)).await; + // Poll until completed or failed, with a max iteration cap (~10 minutes) + for poll in 0..RESEARCH_MAX_POLLS { + tokio::time::sleep(Duration::from_secs(3)).await; let status_resp = cloud.get(&format!("research/{job_id}")).await?; let status = status_resp @@ -445,10 +532,17 @@ impl WebclawMcp { return Err(format!("Research job failed: {error}")); } _ => { - // Still processing, continue polling + if poll % 20 == 19 { + info!(job_id = %job_id, poll, "research still in progress..."); + } } } } + + Err(format!( + "Research job {job_id} timed out after ~10 minutes of polling. \ + Check status manually via the webclaw API: GET /v1/research/{job_id}" + )) } /// Search the web for a query and return structured results. Requires WEBCLAW_API_KEY. @@ -498,7 +592,7 @@ impl WebclawMcp { impl ServerHandler for WebclawMcp { fn get_info(&self) -> ServerInfo { ServerInfo::new(ServerCapabilities::builder().enable_tools().build()) - .with_server_info(Implementation::from_build_env()) + .with_server_info(Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION"))) .with_instructions(String::from( "Webclaw MCP server -- web content extraction for AI agents. \ Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.",