chore: rebrand webclaw to noxa

Jacob Magar 2026-04-11 00:10:38 -04:00
parent a4c351d5ae
commit 8674b60b4e
86 changed files with 781 additions and 2121 deletions

View file

@@ -0,0 +1,303 @@
//! Cloud API fallback for protected sites.
//!
//! When local fetch returns a challenge page, this module retries
//! via api.noxa.io. Requires NOXA_API_KEY to be set.
use std::time::Duration;
use serde_json::{Value, json};
use tracing::info;
const API_BASE: &str = "https://api.noxa.io/v1";
/// Lightweight client for the noxa cloud API.
pub struct CloudClient {
api_key: String,
http: reqwest::Client,
}
impl CloudClient {
/// Create a new cloud client from the NOXA_API_KEY env var.
/// Returns None if the key is unset or empty.
pub fn from_env() -> Option<Self> {
let key = std::env::var("NOXA_API_KEY").ok()?;
if key.is_empty() {
return None;
}
let http = reqwest::Client::builder()
.timeout(Duration::from_secs(60))
.build()
// A builder configured only with a timeout should not fail in practice;
// fall back to a default client (losing the timeout) if it somehow does.
.unwrap_or_default();
Some(Self { api_key: key, http })
}
/// Scrape a URL via the cloud API. Returns the response JSON.
pub async fn scrape(
&self,
url: &str,
formats: &[&str],
include_selectors: &[String],
exclude_selectors: &[String],
only_main_content: bool,
) -> Result<Value, String> {
let mut body = json!({
"url": url,
"formats": formats,
});
if only_main_content {
body["only_main_content"] = json!(true);
}
if !include_selectors.is_empty() {
body["include_selectors"] = json!(include_selectors);
}
if !exclude_selectors.is_empty() {
body["exclude_selectors"] = json!(exclude_selectors);
}
self.post("scrape", body).await
}
/// Generic POST to the cloud API.
pub async fn post(&self, endpoint: &str, body: Value) -> Result<Value, String> {
let resp = self
.http
.post(format!("{API_BASE}/{endpoint}"))
.header("Authorization", format!("Bearer {}", self.api_key))
.json(&body)
.send()
.await
.map_err(|e| format!("Cloud API request failed: {e}"))?;
let status = resp.status();
if !status.is_success() {
let text = resp.text().await.unwrap_or_default();
let truncated = truncate_error(&text);
return Err(format!("Cloud API error {status}: {truncated}"));
}
resp.json::<Value>()
.await
.map_err(|e| format!("Cloud API response parse failed: {e}"))
}
/// Generic GET from the cloud API.
pub async fn get(&self, endpoint: &str) -> Result<Value, String> {
let resp = self
.http
.get(format!("{API_BASE}/{endpoint}"))
.header("Authorization", format!("Bearer {}", self.api_key))
.send()
.await
.map_err(|e| format!("Cloud API request failed: {e}"))?;
let status = resp.status();
if !status.is_success() {
let text = resp.text().await.unwrap_or_default();
let truncated = truncate_error(&text);
return Err(format!("Cloud API error {status}: {truncated}"));
}
resp.json::<Value>()
.await
.map_err(|e| format!("Cloud API response parse failed: {e}"))
}
}
/// Truncate error body to avoid flooding logs with huge HTML responses.
fn truncate_error(text: &str) -> &str {
const MAX_LEN: usize = 500;
match text.char_indices().nth(MAX_LEN) {
Some((byte_pos, _)) => &text[..byte_pos],
None => text,
}
}
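// Illustrative tests for the boundary-safe truncation above (test module not
// part of the original commit): slicing at a raw byte offset would panic when
// byte 500 lands inside a multi-byte UTF-8 character, which is why
// truncate_error walks char_indices instead.
#[cfg(test)]
mod truncate_error_tests {
    use super::truncate_error;

    #[test]
    fn truncates_long_multibyte_text_on_char_boundary() {
        // 600 three-byte chars; a naive &text[..500] would panic mid-char.
        let text = "漢".repeat(600);
        assert_eq!(truncate_error(&text).chars().count(), 500);
    }

    #[test]
    fn passes_short_text_through_unchanged() {
        assert_eq!(truncate_error("short error body"), "short error body");
    }
}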
/// Check whether fetched HTML looks like a bot protection challenge page.
/// Detects Cloudflare, DataDome, AWS WAF, and hCaptcha challenge markers.
pub fn is_bot_protected(html: &str, headers: &noxa_fetch::HeaderMap) -> bool {
let html_lower = html.to_lowercase();
// Cloudflare challenge page
if html_lower.contains("_cf_chl_opt") || html_lower.contains("challenge-platform") {
return true;
}
// Cloudflare "checking your browser" spinner
if (html_lower.contains("just a moment") || html_lower.contains("checking your browser"))
&& html_lower.contains("cf-spinner")
{
return true;
}
// Cloudflare Turnstile (short pages only: a standalone challenge page, not a widget embedded in real content)
if (html_lower.contains("cf-turnstile")
|| html_lower.contains("challenges.cloudflare.com/turnstile"))
&& html.len() < 100_000
{
return true;
}
// DataDome
if html_lower.contains("geo.captcha-delivery.com")
|| html_lower.contains("captcha-delivery.com/captcha")
{
return true;
}
// AWS WAF
if html_lower.contains("awswaf-captcha") || html_lower.contains("aws-waf-client-browser") {
return true;
}
// hCaptcha blocking page
if html_lower.contains("hcaptcha.com")
&& html_lower.contains("h-captcha")
&& html.len() < 50_000
{
return true;
}
// Cloudflare via headers + challenge body
let has_cf_headers = headers.get("cf-ray").is_some() || headers.get("cf-mitigated").is_some();
if has_cf_headers
&& (html_lower.contains("just a moment") || html_lower.contains("checking your browser"))
{
return true;
}
false
}
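// Illustrative detector checks (test module not part of the original commit;
// assumes noxa_fetch::HeaderMap exposes an http-style `new()` constructor).
// A Cloudflare interstitial marker should trip the detector even without
// cf-* headers; an ordinary page should not.
#[cfg(test)]
mod bot_protection_tests {
    use super::is_bot_protected;

    #[test]
    fn flags_cloudflare_challenge_marker() {
        let headers = noxa_fetch::HeaderMap::new();
        let html = "<html><script>window._cf_chl_opt = {cvId: '3'};</script></html>";
        assert!(is_bot_protected(html, &headers));
    }

    #[test]
    fn ignores_ordinary_page() {
        let headers = noxa_fetch::HeaderMap::new();
        assert!(!is_bot_protected("<html><body>Hello, world.</body></html>", &headers));
    }
}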
/// Check if a page likely needs JS rendering (SPA with almost no text content).
pub fn needs_js_rendering(word_count: usize, html: &str) -> bool {
let has_scripts = html.contains("<script");
// Tier 1: almost no extractable text from a large page
if word_count < 50 && html.len() > 5_000 && has_scripts {
return true;
}
// Tier 2: SPA framework detected with suspiciously low content-to-HTML ratio
if word_count < 800 && html.len() > 50_000 && has_scripts {
let html_lower = html.to_lowercase();
let has_spa_marker = html_lower.contains("react-app")
|| html_lower.contains("id=\"__next\"")
|| html_lower.contains("id=\"root\"")
|| html_lower.contains("id=\"app\"")
|| html_lower.contains("__next_data__")
|| html_lower.contains("nuxt")
|| html_lower.contains("ng-app");
if has_spa_marker {
return true;
}
}
false
}
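// Illustrative checks for the rendering heuristic (test module not part of
// the original commit): a large script-heavy shell with an empty SPA root and
// almost no text trips tier 1, while a text-rich static page passes.
#[cfg(test)]
mod js_rendering_tests {
    use super::needs_js_rendering;

    #[test]
    fn flags_empty_spa_shell() {
        let shell = format!(
            "<html><body><div id=\"root\"></div><script src=\"/bundle.js\"></script>{}</body></html>",
            " ".repeat(6_000) // padding so the page clears the 5 KB size floor
        );
        assert!(needs_js_rendering(10, &shell));
    }

    #[test]
    fn passes_text_rich_static_page() {
        let page = format!("<html><body><article>{}</article></body></html>", "word ".repeat(2_000));
        assert!(!needs_js_rendering(2_000, &page));
    }
}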
/// Result of a smart fetch: either local extraction or cloud API response.
pub enum SmartFetchResult {
/// Successfully extracted locally.
Local(Box<noxa_core::ExtractionResult>),
/// Fell back to cloud API. Contains the API response JSON.
Cloud(Value),
}
/// Try local fetch first, fall back to cloud API if bot-protected or JS-rendered.
///
/// Returns the extraction result (local) or the cloud API response JSON.
/// If no API key is configured and local fetch is blocked, returns an error
/// with a helpful message.
pub async fn smart_fetch(
client: &noxa_fetch::FetchClient,
cloud: Option<&CloudClient>,
url: &str,
include_selectors: &[String],
exclude_selectors: &[String],
only_main_content: bool,
formats: &[&str],
) -> Result<SmartFetchResult, String> {
// Step 1: Try local fetch (with timeout to avoid hanging on slow servers)
let fetch_result = tokio::time::timeout(Duration::from_secs(30), client.fetch(url))
.await
.map_err(|_| format!("Fetch timed out after 30s for {url}"))?
.map_err(|e| format!("Fetch failed: {e}"))?;
// Step 2: Check for bot protection
if is_bot_protected(&fetch_result.html, &fetch_result.headers) {
info!(url, "bot protection detected, falling back to cloud API");
return cloud_fallback(
cloud,
url,
include_selectors,
exclude_selectors,
only_main_content,
formats,
)
.await;
}
// Step 3: Extract locally
let options = noxa_core::ExtractionOptions {
include_selectors: include_selectors.to_vec(),
exclude_selectors: exclude_selectors.to_vec(),
only_main_content,
include_raw_html: false,
};
let extraction =
noxa_core::extract_with_options(&fetch_result.html, Some(&fetch_result.url), &options)
.map_err(|e| format!("Extraction failed: {e}"))?;
// Step 4: Check for JS-rendered pages (low content from large HTML)
if needs_js_rendering(extraction.metadata.word_count, &fetch_result.html) {
info!(
url,
word_count = extraction.metadata.word_count,
html_len = fetch_result.html.len(),
"JS-rendered page detected, falling back to cloud API"
);
return cloud_fallback(
cloud,
url,
include_selectors,
exclude_selectors,
only_main_content,
formats,
)
.await;
}
Ok(SmartFetchResult::Local(Box::new(extraction)))
}
async fn cloud_fallback(
cloud: Option<&CloudClient>,
url: &str,
include_selectors: &[String],
exclude_selectors: &[String],
only_main_content: bool,
formats: &[&str],
) -> Result<SmartFetchResult, String> {
match cloud {
Some(c) => {
let resp = c
.scrape(
url,
formats,
include_selectors,
exclude_selectors,
only_main_content,
)
.await?;
info!(url, "cloud API fallback successful");
Ok(SmartFetchResult::Cloud(resp))
}
None => Err(format!(
"Bot protection detected on {url}. Set NOXA_API_KEY for automatic cloud bypass. \
Get a key at https://noxa.io"
)),
}
}
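// Hedged usage sketch, not part of the original commit: how a caller might
// wire smart_fetch together. FetchClient::new(FetchConfig::default()) mirrors
// the server setup elsewhere in this commit; the cloud client is optional and
// comes from NOXA_API_KEY.
#[cfg(test)]
#[allow(dead_code)]
async fn smart_fetch_usage_sketch() -> Result<(), String> {
    let client = noxa_fetch::FetchClient::new(noxa_fetch::FetchConfig::default())
        .map_err(|e| format!("client init failed: {e}"))?;
    let cloud = CloudClient::from_env();
    match smart_fetch(&client, cloud.as_ref(), "https://example.com", &[], &[], true, &["markdown"]).await? {
        SmartFetchResult::Local(extraction) => println!("{}", extraction.content.markdown),
        SmartFetchResult::Cloud(resp) => println!("{resp}"),
    }
    Ok(())
}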

View file

@@ -0,0 +1,28 @@
//! noxa-mcp: MCP (Model Context Protocol) server for noxa.
//! Exposes web extraction tools over stdio transport for AI agents
//! like Claude Desktop, Claude Code, and other MCP clients.
mod cloud;
mod server;
mod tools;
use rmcp::ServiceExt;
use rmcp::transport::stdio;
use server::NoxaMcp;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
dotenvy::dotenv().ok();
// Log to stderr -- stdout is the MCP transport channel
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.with_writer(std::io::stderr)
.with_ansi(false)
.init();
let service = NoxaMcp::new().await.serve(stdio()).await?;
service.waiting().await?;
Ok(())
}

View file

@@ -0,0 +1,767 @@
//! MCP server implementation for noxa.
//! Exposes web extraction capabilities as tools for AI agents.
//!
//! Uses a local-first architecture: fetches pages directly, then falls back
//! to the noxa cloud API (api.noxa.io) when bot protection or
//! JS rendering is detected. Set NOXA_API_KEY for automatic fallback.
use std::sync::Arc;
use std::time::Duration;
use rmcp::handler::server::router::tool::ToolRouter;
use rmcp::handler::server::wrapper::Parameters;
use rmcp::model::{Implementation, ServerCapabilities, ServerInfo};
use rmcp::{ServerHandler, tool, tool_handler, tool_router};
use serde_json::json;
use tracing::{error, info, warn};
use url::Url;
use crate::cloud::{self, CloudClient, SmartFetchResult};
use crate::tools::*;
pub struct NoxaMcp {
tool_router: ToolRouter<Self>,
fetch_client: Arc<noxa_fetch::FetchClient>,
llm_chain: Option<noxa_llm::ProviderChain>,
cloud: Option<CloudClient>,
}
/// Parse a browser string into a BrowserProfile.
fn parse_browser(browser: Option<&str>) -> noxa_fetch::BrowserProfile {
match browser {
Some("firefox") => noxa_fetch::BrowserProfile::Firefox,
Some("random") => noxa_fetch::BrowserProfile::Random,
_ => noxa_fetch::BrowserProfile::Chrome,
}
}
/// Validate that a URL is non-empty and has an http or https scheme.
fn validate_url(url: &str) -> Result<(), String> {
if url.is_empty() {
return Err("Invalid URL: must not be empty".into());
}
match Url::parse(url) {
Ok(parsed) if parsed.scheme() == "http" || parsed.scheme() == "https" => Ok(()),
Ok(parsed) => Err(format!(
"Invalid URL: scheme '{}' not allowed, must start with http:// or https://",
parsed.scheme()
)),
Err(e) => Err(format!(
"Invalid URL: {e}. Must start with http:// or https://"
)),
}
}
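// Illustrative tests for the URL guard above (test module not part of the
// original commit): only http/https pass; empty strings, other schemes, and
// unparseable input are rejected.
#[cfg(test)]
mod validate_url_tests {
    use super::validate_url;

    #[test]
    fn accepts_http_and_https() {
        assert!(validate_url("https://example.com/page").is_ok());
        assert!(validate_url("http://example.com").is_ok());
    }

    #[test]
    fn rejects_empty_other_schemes_and_garbage() {
        assert!(validate_url("").is_err());
        assert!(validate_url("ftp://example.com").is_err());
        assert!(validate_url("file:///etc/passwd").is_err());
        assert!(validate_url("not a url").is_err());
    }
}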
/// Timeout for local fetch calls (prevents hanging on tarpitting servers).
const LOCAL_FETCH_TIMEOUT: Duration = Duration::from_secs(30);
/// Maximum poll iterations for research jobs (~10 minutes at 3s intervals).
const RESEARCH_MAX_POLLS: u32 = 200;
#[tool_router]
impl NoxaMcp {
pub async fn new() -> Self {
let mut config = noxa_fetch::FetchConfig::default();
// Load proxy config from env vars or local file
if let Ok(proxy) = std::env::var("NOXA_PROXY") {
info!("using single proxy from NOXA_PROXY");
config.proxy = Some(proxy);
}
let proxy_file = std::env::var("NOXA_PROXY_FILE")
.ok()
.unwrap_or_else(|| "proxies.txt".to_string());
if std::path::Path::new(&proxy_file).exists()
&& let Ok(pool) = noxa_fetch::parse_proxy_file(&proxy_file)
&& !pool.is_empty()
{
info!(count = pool.len(), file = %proxy_file, "loaded proxy pool");
config.proxy_pool = pool;
}
let fetch_client = match noxa_fetch::FetchClient::new(config) {
Ok(client) => client,
Err(e) => {
error!("failed to build FetchClient: {e}");
std::process::exit(1);
}
};
let chain = noxa_llm::ProviderChain::default().await;
let llm_chain = if chain.is_empty() {
warn!("no LLM providers available -- extract/summarize tools will fail");
None
} else {
info!(providers = chain.len(), "LLM provider chain ready");
Some(chain)
};
let cloud = CloudClient::from_env();
if cloud.is_some() {
info!("cloud API fallback enabled (NOXA_API_KEY set)");
} else {
warn!(
"NOXA_API_KEY not set -- bot-protected sites will return challenge pages. \
Get a key at https://noxa.io"
);
}
Self {
tool_router: Self::tool_router(),
fetch_client: Arc::new(fetch_client),
llm_chain,
cloud,
}
}
/// Helper: smart fetch with LLM format for extract/summarize tools.
async fn smart_fetch_llm(&self, url: &str) -> Result<SmartFetchResult, String> {
cloud::smart_fetch(
&self.fetch_client,
self.cloud.as_ref(),
url,
&[],
&[],
false,
&["llm", "markdown"],
)
.await
}
/// Scrape a single URL and extract its content as markdown, LLM-optimized text, plain text, or full JSON.
/// Automatically falls back to the noxa cloud API when bot protection or JS rendering is detected.
#[tool]
async fn scrape(&self, Parameters(params): Parameters<ScrapeParams>) -> Result<String, String> {
validate_url(&params.url)?;
let format = params.format.as_deref().unwrap_or("markdown");
let browser = parse_browser(params.browser.as_deref());
let include = params.include_selectors.unwrap_or_default();
let exclude = params.exclude_selectors.unwrap_or_default();
let main_only = params.only_main_content.unwrap_or(false);
// Build cookie header from params
let cookie_header = params
.cookies
.as_ref()
.filter(|c| !c.is_empty())
.map(|c| c.join("; "));
// Use a custom client if non-default browser or cookies are provided
let is_default_browser = matches!(browser, noxa_fetch::BrowserProfile::Chrome);
let needs_custom = !is_default_browser || cookie_header.is_some();
let custom_client;
let client: &noxa_fetch::FetchClient = if needs_custom {
let mut headers = std::collections::HashMap::new();
headers.insert("Accept-Language".to_string(), "en-US,en;q=0.9".to_string());
if let Some(ref cookies) = cookie_header {
headers.insert("Cookie".to_string(), cookies.clone());
}
let config = noxa_fetch::FetchConfig {
browser,
headers,
..Default::default()
};
custom_client = noxa_fetch::FetchClient::new(config)
.map_err(|e| format!("Failed to build client: {e}"))?;
&custom_client
} else {
&self.fetch_client
};
let formats = [format];
let result = cloud::smart_fetch(
client,
self.cloud.as_ref(),
&params.url,
&include,
&exclude,
main_only,
&formats,
)
.await?;
match result {
SmartFetchResult::Local(extraction) => {
let output = match format {
"llm" => noxa_core::to_llm_text(&extraction, Some(&params.url)),
"text" => extraction.content.plain_text,
"json" => serde_json::to_string_pretty(&extraction).unwrap_or_default(),
_ => extraction.content.markdown,
};
Ok(output)
}
SmartFetchResult::Cloud(resp) => {
// Extract the requested format from the API response
let content = resp
.get(format)
.or_else(|| resp.get("markdown"))
.and_then(|v| v.as_str())
.unwrap_or("");
if content.is_empty() {
// Return full JSON if no content in the expected format
Ok(serde_json::to_string_pretty(&resp).unwrap_or_default())
} else {
Ok(content.to_string())
}
}
}
}
/// Crawl a website starting from a seed URL, following links breadth-first up to a configurable depth and page limit.
#[tool]
async fn crawl(&self, Parameters(params): Parameters<CrawlParams>) -> Result<String, String> {
validate_url(&params.url)?;
if let Some(max) = params.max_pages
&& max > 500
{
return Err("max_pages cannot exceed 500".into());
}
let format = params.format.as_deref().unwrap_or("markdown");
let config = noxa_fetch::CrawlConfig {
max_depth: params.depth.unwrap_or(2) as usize,
max_pages: params.max_pages.unwrap_or(50),
concurrency: params.concurrency.unwrap_or(5),
use_sitemap: params.use_sitemap.unwrap_or(false),
..Default::default()
};
let crawler = noxa_fetch::Crawler::new(&params.url, config)
.map_err(|e| format!("Crawler init failed: {e}"))?;
let result = crawler.crawl(&params.url, None).await;
let mut output = format!(
"Crawled {} pages ({} ok, {} errors) in {:.1}s\n\n",
result.total, result.ok, result.errors, result.elapsed_secs
);
for page in &result.pages {
output.push_str(&format!("--- {} (depth {}) ---\n", page.url, page.depth));
if let Some(ref extraction) = page.extraction {
let content = match format {
"llm" => noxa_core::to_llm_text(extraction, Some(&page.url)),
"text" => extraction.content.plain_text.clone(),
_ => extraction.content.markdown.clone(),
};
output.push_str(&content);
} else if let Some(ref err) = page.error {
output.push_str(&format!("Error: {err}"));
}
output.push_str("\n\n");
}
Ok(output)
}
/// Discover URLs from a website's sitemaps (robots.txt + sitemap.xml).
#[tool]
async fn map(&self, Parameters(params): Parameters<MapParams>) -> Result<String, String> {
validate_url(&params.url)?;
let entries = noxa_fetch::sitemap::discover(&self.fetch_client, &params.url)
.await
.map_err(|e| format!("Sitemap discovery failed: {e}"))?;
let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
Ok(format!(
"Discovered {} URLs:\n\n{}",
urls.len(),
urls.join("\n")
))
}
/// Extract content from multiple URLs concurrently.
#[tool]
async fn batch(&self, Parameters(params): Parameters<BatchParams>) -> Result<String, String> {
if params.urls.is_empty() {
return Err("urls must not be empty".into());
}
if params.urls.len() > 100 {
return Err("batch is limited to 100 URLs per request".into());
}
for u in &params.urls {
validate_url(u)?;
}
let format = params.format.as_deref().unwrap_or("markdown");
let concurrency = params.concurrency.unwrap_or(5);
let url_refs: Vec<&str> = params.urls.iter().map(String::as_str).collect();
let results = self
.fetch_client
.fetch_and_extract_batch(&url_refs, concurrency)
.await;
let mut output = format!("Extracted {} URLs:\n\n", results.len());
for r in &results {
output.push_str(&format!("--- {} ---\n", r.url));
match &r.result {
Ok(extraction) => {
let content = match format {
"llm" => noxa_core::to_llm_text(extraction, Some(&r.url)),
"text" => extraction.content.plain_text.clone(),
_ => extraction.content.markdown.clone(),
};
output.push_str(&content);
}
Err(e) => {
output.push_str(&format!("Error: {e}"));
}
}
output.push_str("\n\n");
}
Ok(output)
}
/// Extract structured data from a web page using an LLM. Provide either a JSON schema or a natural language prompt.
/// Falls back to the noxa cloud API when no local LLM is available or bot protection is detected.
#[tool]
async fn extract(
&self,
Parameters(params): Parameters<ExtractParams>,
) -> Result<String, String> {
validate_url(&params.url)?;
if params.schema.is_none() && params.prompt.is_none() {
return Err("Either 'schema' or 'prompt' is required for extraction.".into());
}
// No local LLM -- fall back to cloud API directly
if self.llm_chain.is_none() {
let cloud = self.cloud.as_ref().ok_or(
"No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
)?;
let mut body = json!({"url": params.url});
if let Some(ref schema) = params.schema {
body["schema"] = json!(schema);
}
if let Some(ref prompt) = params.prompt {
body["prompt"] = json!(prompt);
}
let resp = cloud.post("extract", body).await?;
return Ok(serde_json::to_string_pretty(&resp).unwrap_or_default());
}
let chain = self.llm_chain.as_ref().unwrap();
let llm_content = match self.smart_fetch_llm(&params.url).await? {
SmartFetchResult::Local(extraction) => {
noxa_core::to_llm_text(&extraction, Some(&params.url))
}
SmartFetchResult::Cloud(resp) => resp
.get("llm")
.or_else(|| resp.get("markdown"))
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string(),
};
let data = if let Some(ref schema) = params.schema {
noxa_llm::extract::extract_json(&llm_content, schema, chain, None)
.await
.map_err(|e| format!("LLM extraction failed: {e}"))?
} else {
let prompt = params.prompt.as_deref().unwrap();
noxa_llm::extract::extract_with_prompt(&llm_content, prompt, chain, None)
.await
.map_err(|e| format!("LLM extraction failed: {e}"))?
};
Ok(serde_json::to_string_pretty(&data).unwrap_or_default())
}
/// Summarize the content of a web page using an LLM.
/// Falls back to the noxa cloud API when no local LLM is available or bot protection is detected.
#[tool]
async fn summarize(
&self,
Parameters(params): Parameters<SummarizeParams>,
) -> Result<String, String> {
validate_url(&params.url)?;
// No local LLM -- fall back to cloud API directly
if self.llm_chain.is_none() {
let cloud = self.cloud.as_ref().ok_or(
"No LLM providers available. Set OPENAI_API_KEY, ANTHROPIC_API_KEY, or NOXA_API_KEY for cloud fallback.",
)?;
let mut body = json!({"url": params.url});
if let Some(sentences) = params.max_sentences {
body["max_sentences"] = json!(sentences);
}
let resp = cloud.post("summarize", body).await?;
let summary = resp.get("summary").and_then(|v| v.as_str()).unwrap_or("");
if summary.is_empty() {
return Ok(serde_json::to_string_pretty(&resp).unwrap_or_default());
}
return Ok(summary.to_string());
}
let chain = self.llm_chain.as_ref().unwrap();
let llm_content = match self.smart_fetch_llm(&params.url).await? {
SmartFetchResult::Local(extraction) => {
noxa_core::to_llm_text(&extraction, Some(&params.url))
}
SmartFetchResult::Cloud(resp) => resp
.get("llm")
.or_else(|| resp.get("markdown"))
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string(),
};
noxa_llm::summarize::summarize(&llm_content, params.max_sentences, chain, None)
.await
.map_err(|e| format!("Summarization failed: {e}"))
}
/// Compare the current content of a URL against a previous extraction snapshot, showing what changed.
/// Automatically falls back to the noxa cloud API when bot protection is detected.
#[tool]
async fn diff(&self, Parameters(params): Parameters<DiffParams>) -> Result<String, String> {
validate_url(&params.url)?;
let previous: noxa_core::ExtractionResult =
serde_json::from_str(&params.previous_snapshot)
.map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
let result = cloud::smart_fetch(
&self.fetch_client,
self.cloud.as_ref(),
&params.url,
&[],
&[],
false,
&["markdown"],
)
.await?;
match result {
SmartFetchResult::Local(current) => {
let content_diff = noxa_core::diff::diff(&previous, &current);
Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default())
}
SmartFetchResult::Cloud(resp) => {
// Extract markdown from the cloud response and build a minimal
// ExtractionResult so we can compute the diff locally.
let markdown = resp.get("markdown").and_then(|v| v.as_str()).unwrap_or("");
if markdown.is_empty() {
return Err(
"Cloud API fallback returned no markdown content; cannot compute diff."
.into(),
);
}
let current = noxa_core::ExtractionResult {
content: noxa_core::Content {
markdown: markdown.to_string(),
plain_text: markdown.to_string(),
links: Vec::new(),
images: Vec::new(),
code_blocks: Vec::new(),
raw_html: None,
},
metadata: noxa_core::Metadata {
title: None,
description: None,
author: None,
published_date: None,
language: None,
url: Some(params.url.clone()),
site_name: None,
image: None,
favicon: None,
word_count: markdown.split_whitespace().count(),
},
domain_data: None,
structured_data: Vec::new(),
};
let content_diff = noxa_core::diff::diff(&previous, &current);
Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default())
}
}
}
/// Extract brand identity (colors, fonts, logo, favicon) from a website's HTML and CSS.
/// Automatically falls back to the noxa cloud API when bot protection is detected.
#[tool]
async fn brand(&self, Parameters(params): Parameters<BrandParams>) -> Result<String, String> {
validate_url(&params.url)?;
let fetch_result =
tokio::time::timeout(LOCAL_FETCH_TIMEOUT, self.fetch_client.fetch(&params.url))
.await
.map_err(|_| format!("Fetch timed out after {}s for {}", LOCAL_FETCH_TIMEOUT.as_secs(), params.url))?
.map_err(|e| format!("Fetch failed: {e}"))?;
// Check for bot protection before extracting brand
if cloud::is_bot_protected(&fetch_result.html, &fetch_result.headers) {
if let Some(ref c) = self.cloud {
let resp = c
.post("brand", serde_json::json!({"url": params.url}))
.await?;
return Ok(serde_json::to_string_pretty(&resp).unwrap_or_default());
} else {
return Err(format!(
"Bot protection detected on {}. Set NOXA_API_KEY for automatic cloud bypass. \
Get a key at https://noxa.io",
params.url
));
}
}
let identity =
noxa_core::brand::extract_brand(&fetch_result.html, Some(&fetch_result.url));
Ok(serde_json::to_string_pretty(&identity).unwrap_or_default())
}
/// Run a deep research investigation on a topic or question. Requires NOXA_API_KEY.
/// Saves full result to ~/.noxa/research/ and returns the file path + key findings.
/// Checks cache first -- same query returns the cached result without spending credits.
#[tool]
async fn research(
&self,
Parameters(params): Parameters<ResearchParams>,
) -> Result<String, String> {
let cloud = self
.cloud
.as_ref()
.ok_or("Research requires NOXA_API_KEY. Get a key at https://noxa.io")?;
let research_dir = research_dir();
let slug = slugify(&params.query);
// Check cache first
if let Some(cached) = load_cached_research(&research_dir, &slug) {
info!(query = %params.query, "returning cached research");
return Ok(cached);
}
let mut body = json!({ "query": params.query });
if let Some(deep) = params.deep {
body["deep"] = json!(deep);
}
if let Some(ref topic) = params.topic {
body["topic"] = json!(topic);
}
// Start the research job
let start_resp = cloud.post("research", body).await?;
let job_id = start_resp
.get("id")
.and_then(|v| v.as_str())
.ok_or("Research API did not return a job ID")?
.to_string();
info!(job_id = %job_id, "research job started, polling for completion");
// Poll until completed or failed
for poll in 0..RESEARCH_MAX_POLLS {
tokio::time::sleep(Duration::from_secs(3)).await;
let status_resp = cloud.get(&format!("research/{job_id}")).await?;
let status = status_resp
.get("status")
.and_then(|v| v.as_str())
.unwrap_or("unknown");
match status {
"completed" => {
// Save full result to file
let (report_path, json_path) =
save_research(&research_dir, &slug, &status_resp);
// Build compact response: file paths + findings (no full report)
let sources_count = status_resp
.get("sources_count")
.and_then(|v| v.as_i64())
.unwrap_or(0);
let findings_count = status_resp
.get("findings_count")
.and_then(|v| v.as_i64())
.unwrap_or(0);
let mut response = json!({
"status": "completed",
"query": params.query,
"report_file": report_path,
"json_file": json_path,
"sources_count": sources_count,
"findings_count": findings_count,
});
if let Some(findings) = status_resp.get("findings") {
response["findings"] = findings.clone();
}
if let Some(sources) = status_resp.get("sources") {
response["sources"] = sources.clone();
}
return Ok(serde_json::to_string_pretty(&response).unwrap_or_default());
}
"failed" => {
let error = status_resp
.get("error")
.and_then(|v| v.as_str())
.unwrap_or("unknown error");
return Err(format!("Research job failed: {error}"));
}
_ => {
if poll % 20 == 19 {
info!(job_id = %job_id, poll, "research still in progress...");
}
}
}
}
Err(format!(
"Research job {job_id} timed out after ~10 minutes of polling. \
Check status manually via the noxa API: GET /v1/research/{job_id}"
))
}
/// Search the web for a query and return structured results. Requires NOXA_API_KEY.
#[tool]
async fn search(&self, Parameters(params): Parameters<SearchParams>) -> Result<String, String> {
let cloud = self
.cloud
.as_ref()
.ok_or("Search requires NOXA_API_KEY. Get a key at https://noxa.io")?;
let mut body = json!({ "query": params.query });
if let Some(num) = params.num_results {
body["num_results"] = json!(num);
}
let resp = cloud.post("search", body).await?;
// Format results for readability
if let Some(results) = resp.get("results").and_then(|v| v.as_array()) {
let mut output = format!("Found {} results:\n\n", results.len());
for (i, result) in results.iter().enumerate() {
let title = result.get("title").and_then(|v| v.as_str()).unwrap_or("");
let url = result.get("url").and_then(|v| v.as_str()).unwrap_or("");
let snippet = result
.get("snippet")
.or_else(|| result.get("description"))
.and_then(|v| v.as_str())
.unwrap_or("");
output.push_str(&format!(
"{}. {}\n {}\n {}\n\n",
i + 1,
title,
url,
snippet
));
}
Ok(output)
} else {
// Fallback: return raw JSON if unexpected shape
Ok(serde_json::to_string_pretty(&resp).unwrap_or_default())
}
}
}
#[tool_handler]
impl ServerHandler for NoxaMcp {
fn get_info(&self) -> ServerInfo {
ServerInfo::new(ServerCapabilities::builder().enable_tools().build())
.with_server_info(Implementation::new("noxa-mcp", env!("CARGO_PKG_VERSION")))
.with_instructions(String::from(
"Noxa MCP server -- web content extraction for AI agents. \
Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.",
))
}
}
// ---------------------------------------------------------------------------
// Research file helpers
// ---------------------------------------------------------------------------
fn research_dir() -> std::path::PathBuf {
let dir = dirs::home_dir()
.unwrap_or_else(|| std::path::PathBuf::from("."))
.join(".noxa")
.join("research");
std::fs::create_dir_all(&dir).ok();
dir
}
fn slugify(query: &str) -> String {
let s: String = query
.chars()
.map(|c| {
if c.is_alphanumeric() || c == ' ' {
c
} else {
' '
}
})
.collect::<String>()
.split_whitespace()
.collect::<Vec<_>>()
.join("-")
.to_lowercase();
// Truncate on a char boundary; naive byte slicing can panic on multi-byte input.
match s.char_indices().nth(60) {
    Some((byte_pos, _)) => s[..byte_pos].to_string(),
    None => s,
}
}
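// Illustrative slugify checks (test module not part of the original commit):
// punctuation collapses to single hyphens, output is lowercased, and the
// 60-character cap respects UTF-8 boundaries.
#[cfg(test)]
mod slugify_tests {
    use super::slugify;

    #[test]
    fn collapses_punctuation_and_lowercases() {
        assert_eq!(slugify("What's new in Rust 1.80?"), "what-s-new-in-rust-1-80");
    }

    #[test]
    fn caps_length_on_char_boundaries() {
        let slug = slugify(&"漢".repeat(100));
        assert!(slug.chars().count() <= 60);
    }
}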
/// Check for a cached research result. Returns the compact response if found.
fn load_cached_research(dir: &std::path::Path, slug: &str) -> Option<String> {
let json_path = dir.join(format!("{slug}.json"));
let report_path = dir.join(format!("{slug}.md"));
if !json_path.exists() || !report_path.exists() {
return None;
}
let json_str = std::fs::read_to_string(&json_path).ok()?;
let data: serde_json::Value = serde_json::from_str(&json_str).ok()?;
// Build compact response from cache
let mut response = json!({
"status": "completed",
"cached": true,
"query": data.get("query").cloned().unwrap_or(json!("")),
"report_file": report_path.to_string_lossy(),
"json_file": json_path.to_string_lossy(),
"sources_count": data.get("sources_count").cloned().unwrap_or(json!(0)),
"findings_count": data.get("findings_count").cloned().unwrap_or(json!(0)),
});
if let Some(findings) = data.get("findings") {
response["findings"] = findings.clone();
}
if let Some(sources) = data.get("sources") {
response["sources"] = sources.clone();
}
Some(serde_json::to_string_pretty(&response).unwrap_or_default())
}
/// Save research result to disk. Returns (report_path, json_path) as strings.
fn save_research(dir: &std::path::Path, slug: &str, data: &serde_json::Value) -> (String, String) {
let json_path = dir.join(format!("{slug}.json"));
let report_path = dir.join(format!("{slug}.md"));
// Save full JSON
if let Ok(json_str) = serde_json::to_string_pretty(data) {
std::fs::write(&json_path, json_str).ok();
}
// Save report as markdown
if let Some(report) = data.get("report").and_then(|v| v.as_str()) {
std::fs::write(&report_path, report).ok();
}
(
report_path.to_string_lossy().to_string(),
json_path.to_string_lossy().to_string(),
)
}

View file

@@ -0,0 +1,105 @@
//! Tool parameter structs for MCP tool inputs.
//! Each struct derives JsonSchema for automatic schema generation,
//! and Deserialize for parsing from MCP tool call arguments.
use schemars::JsonSchema;
use serde::Deserialize;
#[derive(Debug, Deserialize, JsonSchema)]
pub struct ScrapeParams {
/// URL to scrape
pub url: String,
/// Output format: "markdown" (default), "llm", "text", or "json"
pub format: Option<String>,
/// CSS selectors to include (only extract matching elements)
pub include_selectors: Option<Vec<String>>,
/// CSS selectors to exclude from output
pub exclude_selectors: Option<Vec<String>>,
/// If true, extract only the main content (article/main element)
pub only_main_content: Option<bool>,
/// Browser profile: "chrome" (default), "firefox", or "random"
pub browser: Option<String>,
/// Cookies to send with the request (e.g. ["name=value", "session=abc123"])
pub cookies: Option<Vec<String>>,
}
#[derive(Debug, Deserialize, JsonSchema)]
pub struct CrawlParams {
/// Seed URL to start crawling from
pub url: String,
/// Maximum link depth to follow (default: 2)
pub depth: Option<u32>,
/// Maximum number of pages to crawl (default: 50)
pub max_pages: Option<usize>,
/// Number of concurrent requests (default: 5)
pub concurrency: Option<usize>,
/// Seed the frontier from sitemap discovery before crawling
pub use_sitemap: Option<bool>,
/// Output format for each page: "markdown" (default), "llm", "text"
pub format: Option<String>,
}
#[derive(Debug, Deserialize, JsonSchema)]
pub struct MapParams {
/// Base URL to discover sitemaps from (e.g. https://example.com)
pub url: String,
}
#[derive(Debug, Deserialize, JsonSchema)]
pub struct BatchParams {
/// List of URLs to extract content from
pub urls: Vec<String>,
/// Output format: "markdown" (default), "llm", "text"
pub format: Option<String>,
/// Number of concurrent requests (default: 5)
pub concurrency: Option<usize>,
}
#[derive(Debug, Deserialize, JsonSchema)]
pub struct ExtractParams {
/// URL to fetch and extract structured data from
pub url: String,
/// Natural language prompt describing what to extract
pub prompt: Option<String>,
/// JSON schema describing the structure to extract
pub schema: Option<serde_json::Value>,
}
#[derive(Debug, Deserialize, JsonSchema)]
pub struct SummarizeParams {
/// URL to fetch and summarize
pub url: String,
/// Number of sentences in the summary (default: 3)
pub max_sentences: Option<usize>,
}
#[derive(Debug, Deserialize, JsonSchema)]
pub struct DiffParams {
/// URL to fetch current content from
pub url: String,
/// Previous extraction snapshot as a JSON string (ExtractionResult)
pub previous_snapshot: String,
}
#[derive(Debug, Deserialize, JsonSchema)]
pub struct BrandParams {
/// URL to extract brand identity from
pub url: String,
}
#[derive(Debug, Deserialize, JsonSchema)]
pub struct ResearchParams {
/// Research query or question to investigate
pub query: String,
/// Enable deep research mode for more thorough investigation (default: false)
pub deep: Option<bool>,
/// Topic hint to guide research focus (e.g. "technology", "finance", "science")
pub topic: Option<String>,
}
#[derive(Debug, Deserialize, JsonSchema)]
pub struct SearchParams {
/// Search query
pub query: String,
/// Number of results to return (default: 10)
pub num_results: Option<u32>,
}
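// A minimal sketch (test module not part of the original commit) of how MCP
// tool-call arguments deserialize into these structs: every optional field
// can simply be omitted from the JSON body.
#[cfg(test)]
mod param_tests {
    use super::ScrapeParams;

    #[test]
    fn scrape_params_parse_from_tool_call_json() {
        let args = r#"{"url": "https://example.com", "format": "llm", "only_main_content": true}"#;
        let params: ScrapeParams = serde_json::from_str(args).expect("valid scrape params");
        assert_eq!(params.url, "https://example.com");
        assert_eq!(params.format.as_deref(), Some("llm"));
        assert_eq!(params.only_main_content, Some(true));
        assert!(params.cookies.is_none());
    }
}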