Initial release: webclaw v0.1.0 — web content extraction for LLMs

CLI + MCP server for extracting clean, structured content from any URL.
6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats.

MIT Licensed | https://webclaw.io
This commit is contained in:
Valerio 2026-03-23 18:31:11 +01:00
commit c99ec684fa
79 changed files with 24074 additions and 0 deletions

View file

@ -0,0 +1,292 @@
/// Cloud API fallback for protected sites.
///
/// When local fetch returns a challenge page, this module retries
/// via api.webclaw.io. Requires WEBCLAW_API_KEY to be set.
use std::collections::HashMap;
use serde_json::{Value, json};
use tracing::info;
/// Base URL for all webclaw cloud API (v1) endpoints.
const API_BASE: &str = "https://api.webclaw.io/v1";
/// Lightweight client for the webclaw cloud API.
pub struct CloudClient {
    // API key sent as `Authorization: Bearer <key>` on every request.
    api_key: String,
    // Reusable HTTP client shared across requests.
    http: reqwest::Client,
}
impl CloudClient {
    /// Create a new cloud client from the `WEBCLAW_API_KEY` env var.
    ///
    /// Returns `None` if the variable is unset or empty.
    pub fn from_env() -> Option<Self> {
        let key = std::env::var("WEBCLAW_API_KEY").ok()?;
        if key.is_empty() {
            return None;
        }
        Some(Self {
            api_key: key,
            http: reqwest::Client::new(),
        })
    }

    /// Scrape a URL via the cloud API. Returns the response JSON.
    ///
    /// Optional fields are omitted from the request body when empty/false so
    /// the API's server-side defaults apply.
    pub async fn scrape(
        &self,
        url: &str,
        formats: &[&str],
        include_selectors: &[String],
        exclude_selectors: &[String],
        only_main_content: bool,
    ) -> Result<Value, String> {
        let mut body = json!({
            "url": url,
            "formats": formats,
        });
        if only_main_content {
            body["only_main_content"] = json!(true);
        }
        if !include_selectors.is_empty() {
            body["include_selectors"] = json!(include_selectors);
        }
        if !exclude_selectors.is_empty() {
            body["exclude_selectors"] = json!(exclude_selectors);
        }
        self.post("scrape", body).await
    }

    /// Generic POST to the cloud API.
    pub async fn post(&self, endpoint: &str, body: Value) -> Result<Value, String> {
        let resp = self
            .http
            .post(format!("{API_BASE}/{endpoint}"))
            .header("Authorization", format!("Bearer {}", self.api_key))
            .json(&body)
            .send()
            .await
            .map_err(|e| format!("Cloud API request failed: {e}"))?;
        Self::handle_response(resp).await
    }

    /// Generic GET from the cloud API.
    pub async fn get(&self, endpoint: &str) -> Result<Value, String> {
        let resp = self
            .http
            .get(format!("{API_BASE}/{endpoint}"))
            .header("Authorization", format!("Bearer {}", self.api_key))
            .send()
            .await
            .map_err(|e| format!("Cloud API request failed: {e}"))?;
        Self::handle_response(resp).await
    }

    /// Shared response handling for GET/POST: a non-2xx status becomes an
    /// error carrying the response body text; success is parsed as JSON.
    async fn handle_response(resp: reqwest::Response) -> Result<Value, String> {
        let status = resp.status();
        if !status.is_success() {
            let text = resp.text().await.unwrap_or_default();
            return Err(format!("Cloud API error {status}: {text}"));
        }
        resp.json::<Value>()
            .await
            .map_err(|e| format!("Cloud API response parse failed: {e}"))
    }
}
/// Heuristic check for bot-protection challenge pages in fetched HTML.
///
/// Recognizes Cloudflare (challenge platform, browser-check spinner,
/// Turnstile), DataDome, AWS WAF, and hCaptcha interstitials, plus the
/// combination of Cloudflare response headers with challenge wording.
pub fn is_bot_protected(html: &str, headers: &HashMap<String, String>) -> bool {
    let lowered = html.to_lowercase();
    let has = |needle: &str| lowered.contains(needle);

    // Cloudflare challenge-platform markers.
    if has("_cf_chl_opt") || has("challenge-platform") {
        return true;
    }

    // Cloudflare "checking your browser" interstitial with its spinner.
    let challenge_text = has("just a moment") || has("checking your browser");
    if challenge_text && has("cf-spinner") {
        return true;
    }

    // Cloudflare Turnstile: only treat short pages as challenges, since the
    // widget can also be embedded in legitimate full-size content pages.
    let turnstile = has("cf-turnstile") || has("challenges.cloudflare.com/turnstile");
    if turnstile && html.len() < 100_000 {
        return true;
    }

    // DataDome captcha delivery.
    if has("geo.captcha-delivery.com") || has("captcha-delivery.com/captcha") {
        return true;
    }

    // AWS WAF captcha.
    if has("awswaf-captcha") || has("aws-waf-client-browser") {
        return true;
    }

    // hCaptcha blocking page (short pages only, same reasoning as Turnstile).
    if has("hcaptcha.com") && has("h-captcha") && html.len() < 50_000 {
        return true;
    }

    // Cloudflare response headers paired with challenge wording in the body.
    let cf_headers = headers
        .keys()
        .any(|k| k.eq_ignore_ascii_case("cf-ray") || k.eq_ignore_ascii_case("cf-mitigated"));
    cf_headers && challenge_text
}
/// Heuristic: does this page likely require JavaScript rendering?
///
/// Flags pages where a sizeable HTML payload yields almost no extractable
/// text, or where an SPA framework marker appears on a large, text-poor page.
pub fn needs_js_rendering(word_count: usize, html: &str) -> bool {
    // Both tiers require the presence of at least one script tag.
    if !html.contains("<script") {
        return false;
    }

    // Tier 1: a sizeable page that produced almost no words.
    if word_count < 50 && html.len() > 5_000 {
        return true;
    }

    // Tier 2: known SPA framework marker on a large page with a
    // suspiciously low content-to-HTML ratio.
    if word_count < 800 && html.len() > 50_000 {
        const SPA_MARKERS: [&str; 7] = [
            "react-app",
            "id=\"__next\"",
            "id=\"root\"",
            "id=\"app\"",
            "__next_data__",
            "nuxt",
            "ng-app",
        ];
        let lowered = html.to_lowercase();
        if SPA_MARKERS.iter().any(|m| lowered.contains(m)) {
            return true;
        }
    }

    false
}
/// Result of a smart fetch: either local extraction or cloud API response.
pub enum SmartFetchResult {
    /// Successfully extracted locally. Boxed to keep the enum variant small.
    Local(Box<webclaw_core::ExtractionResult>),
    /// Fell back to cloud API. Contains the API response JSON.
    Cloud(Value),
}
/// Try local fetch first, fall back to cloud API if bot-protected or JS-rendered.
///
/// Returns the extraction result (local) or the cloud API response JSON.
/// If no API key is configured and local fetch is blocked, returns an error
/// with a helpful message.
pub async fn smart_fetch(
    client: &webclaw_fetch::FetchClient,
    cloud: Option<&CloudClient>,
    url: &str,
    include_selectors: &[String],
    exclude_selectors: &[String],
    only_main_content: bool,
    formats: &[&str],
) -> Result<SmartFetchResult, String> {
    // Always attempt the direct (local) fetch first.
    let fetched = match client.fetch(url).await {
        Ok(result) => result,
        Err(e) => return Err(format!("Fetch failed: {e}")),
    };

    // Challenge page detected? Hand off to the cloud API immediately.
    if is_bot_protected(&fetched.html, &fetched.headers) {
        info!(url, "bot protection detected, falling back to cloud API");
        return cloud_fallback(
            cloud,
            url,
            include_selectors,
            exclude_selectors,
            only_main_content,
            formats,
        )
        .await;
    }

    // Run the local extraction pipeline.
    let options = webclaw_core::ExtractionOptions {
        include_selectors: include_selectors.to_vec(),
        exclude_selectors: exclude_selectors.to_vec(),
        only_main_content,
        include_raw_html: false,
    };
    let extraction =
        webclaw_core::extract_with_options(&fetched.html, Some(&fetched.url), &options)
            .map_err(|e| format!("Extraction failed: {e}"))?;

    // JS-rendered SPA (large HTML, few words)? Also go through the cloud.
    if needs_js_rendering(extraction.metadata.word_count, &fetched.html) {
        info!(
            url,
            word_count = extraction.metadata.word_count,
            html_len = fetched.html.len(),
            "JS-rendered page detected, falling back to cloud API"
        );
        return cloud_fallback(
            cloud,
            url,
            include_selectors,
            exclude_selectors,
            only_main_content,
            formats,
        )
        .await;
    }

    Ok(SmartFetchResult::Local(Box::new(extraction)))
}
/// Route a blocked or JS-rendered page through the cloud API, or explain how
/// to enable the fallback when no API key is configured.
async fn cloud_fallback(
    cloud: Option<&CloudClient>,
    url: &str,
    include_selectors: &[String],
    exclude_selectors: &[String],
    only_main_content: bool,
    formats: &[&str],
) -> Result<SmartFetchResult, String> {
    // No cloud client configured -> actionable error for the caller.
    let Some(client) = cloud else {
        return Err(format!(
            "Bot protection detected on {url}. Set WEBCLAW_API_KEY for automatic cloud bypass. \
            Get a key at https://webclaw.io"
        ));
    };
    let resp = client
        .scrape(
            url,
            formats,
            include_selectors,
            exclude_selectors,
            only_main_content,
        )
        .await?;
    info!(url, "cloud API fallback successful");
    Ok(SmartFetchResult::Cloud(resp))
}

View file

@ -0,0 +1,28 @@
/// webclaw-mcp: MCP (Model Context Protocol) server for webclaw.
/// Exposes web extraction tools over stdio transport for AI agents
/// like Claude Desktop, Claude Code, and other MCP clients.
mod cloud;
mod server;
mod tools;
use rmcp::ServiceExt;
use rmcp::transport::stdio;
use server::WebclawMcp;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Load .env if present; a missing file is fine.
    dotenvy::dotenv().ok();

    // Log to stderr -- stdout is the MCP transport channel
    tracing_subscriber::fmt()
        .with_writer(std::io::stderr)
        .with_ansi(false)
        .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
        .init();

    // Serve the MCP tools over stdio and block until the client disconnects.
    let server = WebclawMcp::new().await.serve(stdio()).await?;
    server.waiting().await?;
    Ok(())
}

View file

@ -0,0 +1,507 @@
/// MCP server implementation for webclaw.
/// Exposes web extraction capabilities as tools for AI agents.
///
/// Uses a local-first architecture: fetches pages directly, then falls back
/// to the webclaw cloud API (api.webclaw.io) when bot protection or
/// JS rendering is detected. Set WEBCLAW_API_KEY for automatic fallback.
use std::sync::Arc;
use rmcp::handler::server::router::tool::ToolRouter;
use rmcp::handler::server::wrapper::Parameters;
use rmcp::model::{Implementation, ServerCapabilities, ServerInfo};
use rmcp::{ServerHandler, tool, tool_handler, tool_router};
use serde_json::json;
use tracing::{info, warn};
use crate::cloud::{self, CloudClient, SmartFetchResult};
use crate::tools::*;
/// MCP server state shared across all tool invocations.
pub struct WebclawMcp {
    // Tool dispatch table generated by the `#[tool_router]` macro.
    tool_router: ToolRouter<Self>,
    // Shared HTTP client used for local-first page fetches.
    fetch_client: Arc<webclaw_fetch::FetchClient>,
    // LLM provider chain for extract/summarize; None when no provider is available.
    llm_chain: Option<webclaw_llm::ProviderChain>,
    // Cloud API client; None when WEBCLAW_API_KEY is not set.
    cloud: Option<CloudClient>,
}
/// Parse a browser string into a BrowserProfile.
///
/// `None` and any unrecognized value fall back to the Chrome profile.
fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
    if let Some(name) = browser {
        if name == "firefox" {
            return webclaw_fetch::BrowserProfile::Firefox;
        }
        if name == "random" {
            return webclaw_fetch::BrowserProfile::Random;
        }
    }
    webclaw_fetch::BrowserProfile::Chrome
}
#[tool_router]
impl WebclawMcp {
    /// Build the server: configure the fetch client (with an optional proxy
    /// pool from `proxies.txt`), probe for LLM providers, and check whether
    /// the cloud API key is configured.
    pub async fn new() -> Self {
        let mut config = webclaw_fetch::FetchConfig::default();
        // Auto-load proxies.txt if present
        if std::path::Path::new("proxies.txt").exists()
            && let Ok(pool) = webclaw_fetch::parse_proxy_file("proxies.txt")
            && !pool.is_empty()
        {
            info!(count = pool.len(), "loaded proxy pool from proxies.txt");
            config.proxy_pool = pool;
        }
        let fetch_client =
            webclaw_fetch::FetchClient::new(config).expect("failed to build FetchClient");
        // Providers are discovered from the environment; an empty chain means
        // the LLM-backed tools (extract/summarize) will return errors.
        let chain = webclaw_llm::ProviderChain::default().await;
        let llm_chain = if chain.is_empty() {
            warn!("no LLM providers available -- extract/summarize tools will fail");
            None
        } else {
            info!(providers = chain.len(), "LLM provider chain ready");
            Some(chain)
        };
        let cloud = CloudClient::from_env();
        if cloud.is_some() {
            info!("cloud API fallback enabled (WEBCLAW_API_KEY set)");
        } else {
            warn!(
                "WEBCLAW_API_KEY not set -- bot-protected sites will return challenge pages. \
                Get a key at https://webclaw.io"
            );
        }
        Self {
            tool_router: Self::tool_router(),
            fetch_client: Arc::new(fetch_client),
            llm_chain,
            cloud,
        }
    }

    /// Helper: smart fetch with LLM format for extract/summarize tools.
    /// Requests both "llm" and "markdown" so the cloud response has a
    /// markdown fallback when "llm" is absent.
    async fn smart_fetch_llm(&self, url: &str) -> Result<SmartFetchResult, String> {
        cloud::smart_fetch(
            &self.fetch_client,
            self.cloud.as_ref(),
            url,
            &[],
            &[],
            false,
            &["llm", "markdown"],
        )
        .await
    }

    /// Scrape a single URL and extract its content as markdown, LLM-optimized text, plain text, or full JSON.
    /// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected.
    #[tool]
    async fn scrape(&self, Parameters(params): Parameters<ScrapeParams>) -> Result<String, String> {
        let format = params.format.as_deref().unwrap_or("markdown");
        let browser = parse_browser(params.browser.as_deref());
        let include = params.include_selectors.unwrap_or_default();
        let exclude = params.exclude_selectors.unwrap_or_default();
        let main_only = params.only_main_content.unwrap_or(false);
        // Use a custom client if a non-default browser is requested
        let is_default_browser = matches!(browser, webclaw_fetch::BrowserProfile::Chrome);
        // Declared in the outer scope so the reference taken in the
        // else-branch below outlives `client`.
        let custom_client;
        let client: &webclaw_fetch::FetchClient = if is_default_browser {
            &self.fetch_client
        } else {
            let config = webclaw_fetch::FetchConfig {
                browser,
                ..Default::default()
            };
            custom_client = webclaw_fetch::FetchClient::new(config)
                .map_err(|e| format!("Failed to build client: {e}"))?;
            &custom_client
        };
        let formats = [format];
        let result = cloud::smart_fetch(
            client,
            self.cloud.as_ref(),
            &params.url,
            &include,
            &exclude,
            main_only,
            &formats,
        )
        .await?;
        match result {
            SmartFetchResult::Local(extraction) => {
                // Render the locally-extracted content in the requested format.
                let output = match format {
                    "llm" => webclaw_core::to_llm_text(&extraction, Some(&params.url)),
                    "text" => extraction.content.plain_text,
                    "json" => serde_json::to_string_pretty(&extraction).unwrap_or_default(),
                    _ => extraction.content.markdown,
                };
                Ok(output)
            }
            SmartFetchResult::Cloud(resp) => {
                // Extract the requested format from the API response
                let content = resp
                    .get(format)
                    .or_else(|| resp.get("markdown"))
                    .and_then(|v| v.as_str())
                    .unwrap_or("");
                if content.is_empty() {
                    // Return full JSON if no content in the expected format
                    Ok(serde_json::to_string_pretty(&resp).unwrap_or_default())
                } else {
                    Ok(content.to_string())
                }
            }
        }
    }

    /// Crawl a website starting from a seed URL, following links breadth-first up to a configurable depth and page limit.
    ///
    /// NOTE(review): crawling uses the local fetcher only -- there is no
    /// cloud fallback per page here, unlike `scrape`.
    #[tool]
    async fn crawl(&self, Parameters(params): Parameters<CrawlParams>) -> Result<String, String> {
        let format = params.format.as_deref().unwrap_or("markdown");
        let config = webclaw_fetch::CrawlConfig {
            max_depth: params.depth.unwrap_or(2) as usize,
            max_pages: params.max_pages.unwrap_or(50),
            concurrency: params.concurrency.unwrap_or(5),
            use_sitemap: params.use_sitemap.unwrap_or(false),
            ..Default::default()
        };
        let crawler = webclaw_fetch::Crawler::new(&params.url, config)
            .map_err(|e| format!("Crawler init failed: {e}"))?;
        let result = crawler.crawl(&params.url).await;
        // Summary header, then one section per crawled page.
        let mut output = format!(
            "Crawled {} pages ({} ok, {} errors) in {:.1}s\n\n",
            result.total, result.ok, result.errors, result.elapsed_secs
        );
        for page in &result.pages {
            output.push_str(&format!("--- {} (depth {}) ---\n", page.url, page.depth));
            if let Some(ref extraction) = page.extraction {
                let content = match format {
                    "llm" => webclaw_core::to_llm_text(extraction, Some(&page.url)),
                    "text" => extraction.content.plain_text.clone(),
                    _ => extraction.content.markdown.clone(),
                };
                output.push_str(&content);
            } else if let Some(ref err) = page.error {
                output.push_str(&format!("Error: {err}"));
            }
            output.push_str("\n\n");
        }
        Ok(output)
    }

    /// Discover URLs from a website's sitemaps (robots.txt + sitemap.xml).
    #[tool]
    async fn map(&self, Parameters(params): Parameters<MapParams>) -> Result<String, String> {
        let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, &params.url)
            .await
            .map_err(|e| format!("Sitemap discovery failed: {e}"))?;
        let urls: Vec<&str> = entries.iter().map(|e| e.url.as_str()).collect();
        Ok(format!(
            "Discovered {} URLs:\n\n{}",
            urls.len(),
            urls.join("\n")
        ))
    }

    /// Extract content from multiple URLs concurrently.
    ///
    /// NOTE(review): batch uses the local fetcher only -- no cloud fallback.
    #[tool]
    async fn batch(&self, Parameters(params): Parameters<BatchParams>) -> Result<String, String> {
        if params.urls.is_empty() {
            return Err("urls must not be empty".into());
        }
        let format = params.format.as_deref().unwrap_or("markdown");
        let concurrency = params.concurrency.unwrap_or(5);
        let url_refs: Vec<&str> = params.urls.iter().map(String::as_str).collect();
        let results = self
            .fetch_client
            .fetch_and_extract_batch(&url_refs, concurrency)
            .await;
        // One section per URL; failed URLs report their error inline.
        let mut output = format!("Extracted {} URLs:\n\n", results.len());
        for r in &results {
            output.push_str(&format!("--- {} ---\n", r.url));
            match &r.result {
                Ok(extraction) => {
                    let content = match format {
                        "llm" => webclaw_core::to_llm_text(extraction, Some(&r.url)),
                        "text" => extraction.content.plain_text.clone(),
                        _ => extraction.content.markdown.clone(),
                    };
                    output.push_str(&content);
                }
                Err(e) => {
                    output.push_str(&format!("Error: {e}"));
                }
            }
            output.push_str("\n\n");
        }
        Ok(output)
    }

    /// Extract structured data from a web page using an LLM. Provide either a JSON schema or a natural language prompt.
    /// Automatically falls back to the webclaw cloud API when bot protection is detected.
    #[tool]
    async fn extract(
        &self,
        Parameters(params): Parameters<ExtractParams>,
    ) -> Result<String, String> {
        let chain = self.llm_chain.as_ref().ok_or(
            "No LLM providers available. Set OPENAI_API_KEY or ANTHROPIC_API_KEY, or run Ollama locally.",
        )?;
        if params.schema.is_none() && params.prompt.is_none() {
            return Err("Either 'schema' or 'prompt' is required for extraction.".into());
        }
        // For extract, if we get a cloud fallback we call the cloud extract endpoint directly
        let llm_content = match self.smart_fetch_llm(&params.url).await? {
            SmartFetchResult::Local(extraction) => {
                webclaw_core::to_llm_text(&extraction, Some(&params.url))
            }
            SmartFetchResult::Cloud(resp) => {
                // Use the LLM format from cloud, fall back to markdown
                resp.get("llm")
                    .or_else(|| resp.get("markdown"))
                    .and_then(|v| v.as_str())
                    .unwrap_or("")
                    .to_string()
            }
        };
        // Schema takes precedence over prompt when both are provided.
        let data = if let Some(ref schema) = params.schema {
            webclaw_llm::extract::extract_json(&llm_content, schema, chain, None)
                .await
                .map_err(|e| format!("LLM extraction failed: {e}"))?
        } else {
            // Safe: the earlier guard guarantees prompt is Some when schema is None.
            let prompt = params.prompt.as_deref().unwrap();
            webclaw_llm::extract::extract_with_prompt(&llm_content, prompt, chain, None)
                .await
                .map_err(|e| format!("LLM extraction failed: {e}"))?
        };
        Ok(serde_json::to_string_pretty(&data).unwrap_or_default())
    }

    /// Summarize the content of a web page using an LLM.
    /// Automatically falls back to the webclaw cloud API when bot protection is detected.
    #[tool]
    async fn summarize(
        &self,
        Parameters(params): Parameters<SummarizeParams>,
    ) -> Result<String, String> {
        let chain = self.llm_chain.as_ref().ok_or(
            "No LLM providers available. Set OPENAI_API_KEY or ANTHROPIC_API_KEY, or run Ollama locally.",
        )?;
        let llm_content = match self.smart_fetch_llm(&params.url).await? {
            SmartFetchResult::Local(extraction) => {
                webclaw_core::to_llm_text(&extraction, Some(&params.url))
            }
            SmartFetchResult::Cloud(resp) => resp
                .get("llm")
                .or_else(|| resp.get("markdown"))
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string(),
        };
        webclaw_llm::summarize::summarize(&llm_content, params.max_sentences, chain, None)
            .await
            .map_err(|e| format!("Summarization failed: {e}"))
    }

    /// Compare the current content of a URL against a previous extraction snapshot, showing what changed.
    /// Automatically falls back to the webclaw cloud API when bot protection is detected.
    #[tool]
    async fn diff(&self, Parameters(params): Parameters<DiffParams>) -> Result<String, String> {
        // The snapshot must be a JSON-serialized ExtractionResult.
        let previous: webclaw_core::ExtractionResult =
            serde_json::from_str(&params.previous_snapshot)
                .map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
        let result = cloud::smart_fetch(
            &self.fetch_client,
            self.cloud.as_ref(),
            &params.url,
            &[],
            &[],
            false,
            &["markdown"],
        )
        .await?;
        match result {
            SmartFetchResult::Local(current) => {
                let content_diff = webclaw_core::diff::diff(&previous, &current);
                Ok(serde_json::to_string_pretty(&content_diff).unwrap_or_default())
            }
            SmartFetchResult::Cloud(resp) => {
                // Can't do local diff with cloud content, return the cloud response directly
                Ok(serde_json::to_string_pretty(&resp).unwrap_or_default())
            }
        }
    }

    /// Extract brand identity (colors, fonts, logo, favicon) from a website's HTML and CSS.
    /// Automatically falls back to the webclaw cloud API when bot protection is detected.
    #[tool]
    async fn brand(&self, Parameters(params): Parameters<BrandParams>) -> Result<String, String> {
        let fetch_result = self
            .fetch_client
            .fetch(&params.url)
            .await
            .map_err(|e| format!("Fetch failed: {e}"))?;
        // Check for bot protection before extracting brand
        if cloud::is_bot_protected(&fetch_result.html, &fetch_result.headers) {
            if let Some(ref c) = self.cloud {
                // Brand extraction needs the real page, so hit the dedicated
                // cloud endpoint rather than the generic scrape fallback.
                let resp = c
                    .post("brand", serde_json::json!({"url": params.url}))
                    .await?;
                return Ok(serde_json::to_string_pretty(&resp).unwrap_or_default());
            } else {
                return Err(format!(
                    "Bot protection detected on {}. Set WEBCLAW_API_KEY for automatic cloud bypass. \
                    Get a key at https://webclaw.io",
                    params.url
                ));
            }
        }
        let identity =
            webclaw_core::brand::extract_brand(&fetch_result.html, Some(&fetch_result.url));
        Ok(serde_json::to_string_pretty(&identity).unwrap_or_default())
    }

    /// Run a deep research investigation on a topic or question. Requires WEBCLAW_API_KEY.
    /// Starts an async research job on the webclaw cloud API, then polls until complete.
    #[tool]
    async fn research(
        &self,
        Parameters(params): Parameters<ResearchParams>,
    ) -> Result<String, String> {
        let cloud = self
            .cloud
            .as_ref()
            .ok_or("Research requires WEBCLAW_API_KEY. Get a key at https://webclaw.io")?;
        let mut body = json!({ "query": params.query });
        if let Some(deep) = params.deep {
            body["deep"] = json!(deep);
        }
        if let Some(ref topic) = params.topic {
            body["topic"] = json!(topic);
        }
        // Start the research job
        let start_resp = cloud.post("research", body).await?;
        let job_id = start_resp
            .get("id")
            .and_then(|v| v.as_str())
            .ok_or("Research API did not return a job ID")?
            .to_string();
        info!(job_id = %job_id, "research job started, polling for completion");
        // Poll until completed or failed.
        // NOTE(review): there is no timeout or max-attempt cap -- if the API
        // never reports a terminal status this loops forever; confirm intended.
        loop {
            tokio::time::sleep(std::time::Duration::from_secs(3)).await;
            let status_resp = cloud.get(&format!("research/{job_id}")).await?;
            let status = status_resp
                .get("status")
                .and_then(|v| v.as_str())
                .unwrap_or("unknown");
            match status {
                "completed" => {
                    let report = status_resp
                        .get("report")
                        .and_then(|v| v.as_str())
                        .unwrap_or("");
                    if report.is_empty() {
                        return Ok(serde_json::to_string_pretty(&status_resp).unwrap_or_default());
                    }
                    return Ok(report.to_string());
                }
                "failed" => {
                    let error = status_resp
                        .get("error")
                        .and_then(|v| v.as_str())
                        .unwrap_or("unknown error");
                    return Err(format!("Research job failed: {error}"));
                }
                _ => {
                    // Still processing, continue polling
                }
            }
        }
    }

    /// Search the web for a query and return structured results. Requires WEBCLAW_API_KEY.
    #[tool]
    async fn search(&self, Parameters(params): Parameters<SearchParams>) -> Result<String, String> {
        let cloud = self
            .cloud
            .as_ref()
            .ok_or("Search requires WEBCLAW_API_KEY. Get a key at https://webclaw.io")?;
        let mut body = json!({ "query": params.query });
        if let Some(num) = params.num_results {
            body["num_results"] = json!(num);
        }
        let resp = cloud.post("search", body).await?;
        // Format results for readability
        if let Some(results) = resp.get("results").and_then(|v| v.as_array()) {
            let mut output = format!("Found {} results:\n\n", results.len());
            for (i, result) in results.iter().enumerate() {
                let title = result.get("title").and_then(|v| v.as_str()).unwrap_or("");
                let url = result.get("url").and_then(|v| v.as_str()).unwrap_or("");
                let snippet = result
                    .get("snippet")
                    .or_else(|| result.get("description"))
                    .and_then(|v| v.as_str())
                    .unwrap_or("");
                output.push_str(&format!(
                    "{}. {}\n   {}\n   {}\n\n",
                    i + 1,
                    title,
                    url,
                    snippet
                ));
            }
            Ok(output)
        } else {
            // Fallback: return raw JSON if unexpected shape
            Ok(serde_json::to_string_pretty(&resp).unwrap_or_default())
        }
    }
}
#[tool_handler]
impl ServerHandler for WebclawMcp {
    /// Advertise server capabilities (tools enabled) plus a human-readable
    /// instruction string listing the available tools.
    fn get_info(&self) -> ServerInfo {
        ServerInfo::new(ServerCapabilities::builder().enable_tools().build())
            .with_server_info(Implementation::from_build_env())
            .with_instructions(String::from(
                "Webclaw MCP server -- web content extraction for AI agents. \
                Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.",
            ))
    }
}

View file

@ -0,0 +1,103 @@
/// Tool parameter structs for MCP tool inputs.
/// Each struct derives JsonSchema for automatic schema generation,
/// and Deserialize for parsing from MCP tool call arguments.
use schemars::JsonSchema;
use serde::Deserialize;
/// Parameters for the `scrape` tool (single-URL extraction).
#[derive(Debug, Deserialize, JsonSchema)]
pub struct ScrapeParams {
    /// URL to scrape
    pub url: String,
    /// Output format: "markdown" (default), "llm", "text", or "json"
    pub format: Option<String>,
    /// CSS selectors to include (only extract matching elements)
    pub include_selectors: Option<Vec<String>>,
    /// CSS selectors to exclude from output
    pub exclude_selectors: Option<Vec<String>>,
    /// If true, extract only the main content (article/main element)
    pub only_main_content: Option<bool>,
    /// Browser profile: "chrome" (default), "firefox", or "random"
    pub browser: Option<String>,
}
/// Parameters for the `crawl` tool (breadth-first site crawl).
#[derive(Debug, Deserialize, JsonSchema)]
pub struct CrawlParams {
    /// Seed URL to start crawling from
    pub url: String,
    /// Maximum link depth to follow (default: 2)
    pub depth: Option<u32>,
    /// Maximum number of pages to crawl (default: 50)
    pub max_pages: Option<usize>,
    /// Number of concurrent requests (default: 5)
    pub concurrency: Option<usize>,
    /// Seed the frontier from sitemap discovery before crawling
    pub use_sitemap: Option<bool>,
    /// Output format for each page: "markdown" (default), "llm", "text"
    pub format: Option<String>,
}
/// Parameters for the `map` tool (sitemap URL discovery).
#[derive(Debug, Deserialize, JsonSchema)]
pub struct MapParams {
    /// Base URL to discover sitemaps from (e.g. `<https://example.com>`)
    pub url: String,
}
/// Parameters for the `batch` tool (concurrent multi-URL extraction).
#[derive(Debug, Deserialize, JsonSchema)]
pub struct BatchParams {
    /// List of URLs to extract content from
    pub urls: Vec<String>,
    /// Output format: "markdown" (default), "llm", "text"
    pub format: Option<String>,
    /// Number of concurrent requests (default: 5)
    pub concurrency: Option<usize>,
}
/// Parameters for the `extract` tool (LLM structured extraction).
/// At least one of `prompt` or `schema` must be provided.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct ExtractParams {
    /// URL to fetch and extract structured data from
    pub url: String,
    /// Natural language prompt describing what to extract
    pub prompt: Option<String>,
    /// JSON schema describing the structure to extract
    pub schema: Option<serde_json::Value>,
}
/// Parameters for the `summarize` tool (LLM page summary).
#[derive(Debug, Deserialize, JsonSchema)]
pub struct SummarizeParams {
    /// URL to fetch and summarize
    pub url: String,
    /// Number of sentences in the summary (default: 3)
    pub max_sentences: Option<usize>,
}
/// Parameters for the `diff` tool (snapshot comparison).
#[derive(Debug, Deserialize, JsonSchema)]
pub struct DiffParams {
    /// URL to fetch current content from
    pub url: String,
    /// Previous extraction snapshot as a JSON string (ExtractionResult)
    pub previous_snapshot: String,
}
/// Parameters for the `brand` tool (brand identity extraction).
#[derive(Debug, Deserialize, JsonSchema)]
pub struct BrandParams {
    /// URL to extract brand identity from
    pub url: String,
}
/// Parameters for the `research` tool (cloud deep-research job).
#[derive(Debug, Deserialize, JsonSchema)]
pub struct ResearchParams {
    /// Research query or question to investigate
    pub query: String,
    /// Enable deep research mode for more thorough investigation (default: false)
    pub deep: Option<bool>,
    /// Topic hint to guide research focus (e.g. "technology", "finance", "science")
    pub topic: Option<String>,
}
/// Parameters for the `search` tool (cloud web search).
#[derive(Debug, Deserialize, JsonSchema)]
pub struct SearchParams {
    /// Search query
    pub query: String,
    /// Number of results to return (default: 10)
    pub num_results: Option<u32>,
}