mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
fix(security): prepare 0.6.1 hardening
Merge the 0.6.1 security hardening release candidate after local and CI verification.
This commit is contained in:
commit
a629534490
12 changed files with 216 additions and 54 deletions
30
.github/workflows/release.yml
vendored
30
.github/workflows/release.yml
vendored
|
|
@ -5,14 +5,15 @@ on:
|
|||
tags: ["v*"]
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
packages: write
|
||||
contents: read
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
|
||||
jobs:
|
||||
build:
|
||||
permissions:
|
||||
contents: read
|
||||
name: Build ${{ matrix.target }}
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
|
|
@ -106,9 +107,9 @@ jobs:
|
|||
name: Release
|
||||
needs: build
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: artifacts
|
||||
|
|
@ -122,18 +123,23 @@ jobs:
|
|||
cat SHA256SUMS
|
||||
|
||||
- name: Create GitHub Release
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
generate_release_notes: true
|
||||
files: |
|
||||
artifacts/*.tar.gz
|
||||
artifacts/*.zip
|
||||
artifacts/SHA256SUMS
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
tag="${GITHUB_REF#refs/tags/}"
|
||||
gh release create "$tag" \
|
||||
artifacts/*.tar.gz \
|
||||
artifacts/*.zip \
|
||||
artifacts/SHA256SUMS \
|
||||
--generate-notes
|
||||
|
||||
docker:
|
||||
name: Docker
|
||||
needs: release
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
|
|
@ -193,6 +199,8 @@ jobs:
|
|||
name: Update Homebrew
|
||||
needs: [release, docker]
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
steps:
|
||||
- name: Compute all checksums and update formula
|
||||
env:
|
||||
|
|
|
|||
11
CHANGELOG.md
11
CHANGELOG.md
|
|
@ -3,6 +3,17 @@
|
|||
All notable changes to webclaw are documented here.
|
||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||
|
||||
## [0.6.1] — 2026-05-12
|
||||
|
||||
### Fixed
|
||||
- Hardened URL safety across the CLI, MCP server, and self-hosted API paths so local and private network targets are rejected more consistently, including after DNS resolution and redirects.
|
||||
- Added a timeout around inline JavaScript data extraction so hostile pages cannot keep the extractor busy forever.
|
||||
- Tightened Amazon and eBay URL recognition so deceptive hosts are rejected while common international marketplaces still work.
|
||||
- Avoided unnecessary decoding work on large responses during bot-challenge detection.
|
||||
- Reduced release workflow token permissions so build jobs run with narrower GitHub access.
|
||||
|
||||
---
|
||||
|
||||
## [0.6.0] — 2026-05-10
|
||||
|
||||
### Fixed
|
||||
|
|
|
|||
14
Cargo.lock
generated
14
Cargo.lock
generated
|
|
@ -3219,7 +3219,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.6.0"
|
||||
version = "0.6.1"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -3240,7 +3240,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.6.0"
|
||||
version = "0.6.1"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
|
|
@ -3258,7 +3258,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.6.0"
|
||||
version = "0.6.1"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
|
|
@ -3284,7 +3284,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.6.0"
|
||||
version = "0.6.1"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -3297,7 +3297,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.6.0"
|
||||
version = "0.6.1"
|
||||
dependencies = [
|
||||
"dirs",
|
||||
"dotenvy",
|
||||
|
|
@ -3317,7 +3317,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.6.0"
|
||||
version = "0.6.1"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
@ -3326,7 +3326,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-server"
|
||||
version = "0.6.0"
|
||||
version = "0.6.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.6.0"
|
||||
version = "0.6.1"
|
||||
edition = "2024"
|
||||
license = "AGPL-3.0"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
|
|||
|
|
@ -849,11 +849,18 @@ async fn enrich_html_with_stylesheets(html: &str, base_url: &str) -> String {
|
|||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(5))
|
||||
.redirect(reqwest::redirect::Policy::none())
|
||||
.build()
|
||||
.unwrap_or_default();
|
||||
|
||||
let mut extra_css = String::new();
|
||||
for href in &hrefs {
|
||||
if webclaw_fetch::url_security::validate_public_http_url(href)
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if let Ok(resp) = client.get(href).send().await
|
||||
&& resp.status().is_success()
|
||||
&& let Ok(body) = resp.text().await
|
||||
|
|
|
|||
|
|
@ -9,10 +9,12 @@ use once_cell::sync::Lazy;
|
|||
use regex::Regex;
|
||||
use rquickjs::{Context, Runtime};
|
||||
use scraper::{Html, Selector};
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::debug;
|
||||
|
||||
static SCRIPT_SELECTOR: Lazy<Selector> = Lazy::new(|| Selector::parse("script").unwrap());
|
||||
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
|
||||
const JS_EVAL_TIMEOUT: Duration = Duration::from_millis(250);
|
||||
|
||||
/// A blob of data extracted from JS execution.
|
||||
pub struct JsDataBlob {
|
||||
|
|
@ -49,6 +51,8 @@ pub fn extract_js_data(html: &str) -> Vec<JsDataBlob> {
|
|||
let rt = Runtime::new().expect("QuickJS runtime creation failed");
|
||||
rt.set_memory_limit(64 * 1024 * 1024); // 64 MB
|
||||
rt.set_max_stack_size(1024 * 1024); // 1 MB
|
||||
let deadline = Instant::now() + JS_EVAL_TIMEOUT;
|
||||
rt.set_interrupt_handler(Some(Box::new(move || Instant::now() >= deadline)));
|
||||
|
||||
let ctx = Context::full(&rt).expect("QuickJS context creation failed");
|
||||
|
||||
|
|
@ -464,6 +468,8 @@ fn walk_rsc_tree(value: &serde_json::Value, out: &mut Vec<String>, depth: usize)
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
|
|
@ -493,6 +499,29 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
fn js_eval_interrupts_infinite_loops() {
    // The inline script never finishes on its own, so extraction can only
    // return if evaluation is cut short.
    let html = r#"
<html>
<head>
<script>
while (true) {}
</script>
</head>
<body>hello</body>
</html>
"#;

    let started = Instant::now();
    let blobs = extract_js_data(html);
    let elapsed = started.elapsed();

    // Nothing useful can come out of a script that was aborted mid-loop.
    assert!(blobs.is_empty());
    // Generous bound: the point is "returns at all", not exact timing.
    assert!(
        elapsed < Duration::from_secs(2),
        "QuickJS execution should be interrupted quickly"
    );
}
|
||||
|
||||
#[test]
|
||||
fn skips_external_and_module_scripts() {
|
||||
let html = r#"<html><body>
|
||||
|
|
|
|||
|
|
@ -783,6 +783,10 @@ fn is_pdf_content_type(headers: &http::HeaderMap) -> bool {
|
|||
|
||||
/// Detect if a response looks like a bot protection challenge page.
|
||||
fn is_challenge_response(response: &Response) -> bool {
|
||||
let body_len = response.body().len();
|
||||
if body_len > 15_000 || body_len == 0 {
|
||||
return false;
|
||||
}
|
||||
is_challenge_html(response.text().as_ref())
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ use std::sync::OnceLock;
|
|||
|
||||
use regex::Regex;
|
||||
use serde_json::{Value, json};
|
||||
use url::Url;
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::cloud::{self, CloudError};
|
||||
|
|
@ -52,8 +53,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
|
|||
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = host_of(url);
|
||||
if !is_amazon_host(host) {
|
||||
let Some(host) = host_of(url) else {
|
||||
return false;
|
||||
};
|
||||
if !is_amazon_host(&host) {
|
||||
return false;
|
||||
}
|
||||
parse_asin(url).is_some()
|
||||
|
|
@ -162,17 +165,41 @@ pub fn parse(html: &str, url: &str, asin: &str) -> Value {
|
|||
// URL helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn host_of(url: &str) -> &str {
|
||||
url.split("://")
|
||||
.nth(1)
|
||||
.unwrap_or(url)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or("")
|
||||
fn host_of(url: &str) -> Option<String> {
|
||||
let parsed = Url::parse(url).ok()?;
|
||||
if !parsed.username().is_empty() || parsed.password().is_some() {
|
||||
return None;
|
||||
}
|
||||
parsed.host_str().map(|host| host.to_ascii_lowercase())
|
||||
}
|
||||
|
||||
/// Return `true` only when `host` is a known Amazon marketplace domain,
/// optionally prefixed with a single `www.`.
///
/// Matching is exact against an allow-list rather than by prefix, so hosts
/// such as `www.amazon.evil.com` or `amazon.evil.com` are rejected. The
/// comparison is case-sensitive; `host` is expected to be lowercased already.
fn is_amazon_host(host: &str) -> bool {
    const AMAZON_HOSTS: &[&str] = &[
        "amazon.ae",
        "amazon.ca",
        "amazon.cn",
        "amazon.co.jp",
        "amazon.co.uk",
        "amazon.com",
        "amazon.com.au",
        "amazon.com.be",
        "amazon.com.br",
        "amazon.com.mx",
        "amazon.com.tr",
        "amazon.de",
        "amazon.eg",
        "amazon.es",
        "amazon.fr",
        "amazon.in",
        "amazon.it",
        "amazon.nl",
        "amazon.pl",
        "amazon.sa",
        "amazon.se",
        "amazon.sg",
    ];

    // Only one leading `www.` is stripped; anything else must match verbatim.
    let normalized = host.strip_prefix("www.").unwrap_or(host);
    AMAZON_HOSTS.contains(&normalized)
}
|
||||
|
||||
/// Pull a 10-char ASIN out of any recognised Amazon URL shape:
|
||||
|
|
@ -347,6 +374,9 @@ mod tests {
|
|||
assert!(matches("https://www.amazon.com/dp/B0CHX1W1XY"));
|
||||
assert!(matches("https://www.amazon.co.uk/dp/B0CHX1W1XY/"));
|
||||
assert!(matches("https://www.amazon.de/dp/B0CHX1W1XY?psc=1"));
|
||||
assert!(matches("https://www.amazon.ca/dp/B0CHX1W1XY"));
|
||||
assert!(matches("https://www.amazon.com.au/dp/B0CHX1W1XY"));
|
||||
assert!(matches("https://www.amazon.in/dp/B0CHX1W1XY"));
|
||||
assert!(matches(
|
||||
"https://www.amazon.com/gp/product/B0CHX1W1XY/ref=foo"
|
||||
));
|
||||
|
|
@ -357,6 +387,8 @@ mod tests {
|
|||
assert!(!matches("https://www.amazon.com/"));
|
||||
assert!(!matches("https://www.amazon.com/gp/cart"));
|
||||
assert!(!matches("https://example.com/dp/B0CHX1W1XY"));
|
||||
assert!(!matches("https://www.amazon.com@127.0.0.1/dp/B0CHX1W1XY"));
|
||||
assert!(!matches("https://www.amazon.evil.com/dp/B0CHX1W1XY"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ use std::sync::OnceLock;
|
|||
|
||||
use regex::Regex;
|
||||
use serde_json::{Value, json};
|
||||
use url::Url;
|
||||
|
||||
use super::ExtractorInfo;
|
||||
use crate::cloud::{self, CloudError};
|
||||
|
|
@ -32,8 +33,10 @@ pub const INFO: ExtractorInfo = ExtractorInfo {
|
|||
};
|
||||
|
||||
pub fn matches(url: &str) -> bool {
|
||||
let host = host_of(url);
|
||||
if !is_ebay_host(host) {
|
||||
let Some(host) = host_of(url) else {
|
||||
return false;
|
||||
};
|
||||
if !is_ebay_host(&host) {
|
||||
return false;
|
||||
}
|
||||
parse_item_id(url).is_some()
|
||||
|
|
@ -120,17 +123,37 @@ pub fn parse(html: &str, url: &str, item_id: &str) -> Value {
|
|||
// URL helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn host_of(url: &str) -> &str {
|
||||
url.split("://")
|
||||
.nth(1)
|
||||
.unwrap_or(url)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or("")
|
||||
fn host_of(url: &str) -> Option<String> {
|
||||
let parsed = Url::parse(url).ok()?;
|
||||
if !parsed.username().is_empty() || parsed.password().is_some() {
|
||||
return None;
|
||||
}
|
||||
parsed.host_str().map(|host| host.to_ascii_lowercase())
|
||||
}
|
||||
|
||||
/// Return `true` only when `host` is a known eBay marketplace domain,
/// optionally prefixed with a single `www.`.
///
/// Matching is exact against an allow-list rather than by prefix, so hosts
/// such as `www.ebay.attacker.com` or `ebay.attacker.com` are rejected. The
/// comparison is case-sensitive; `host` is expected to be lowercased already.
fn is_ebay_host(host: &str) -> bool {
    const EBAY_HOSTS: &[&str] = &[
        "ebay.at",
        "ebay.be",
        "ebay.ca",
        "ebay.ch",
        "ebay.co.uk",
        "ebay.com",
        "ebay.com.au",
        "ebay.com.hk",
        "ebay.com.my",
        "ebay.com.sg",
        "ebay.de",
        "ebay.es",
        "ebay.fr",
        "ebay.ie",
        "ebay.it",
        "ebay.nl",
        "ebay.ph",
        "ebay.pl",
    ];

    // Only one leading `www.` is stripped; anything else must match verbatim.
    let normalized = host.strip_prefix("www.").unwrap_or(host);
    EBAY_HOSTS.contains(&normalized)
}
|
||||
|
||||
/// Pull the numeric item id out of `/itm/{id}` or `/itm/{slug}/{id}`
|
||||
|
|
@ -273,9 +296,14 @@ mod tests {
|
|||
"https://www.ebay.com/itm/vintage-typewriter/325478156234"
|
||||
));
|
||||
assert!(matches("https://www.ebay.co.uk/itm/325478156234"));
|
||||
assert!(matches("https://www.ebay.ca/itm/325478156234"));
|
||||
assert!(matches("https://www.ebay.com.au/itm/325478156234"));
|
||||
assert!(matches("https://www.ebay.es/itm/325478156234"));
|
||||
assert!(!matches("https://www.ebay.com/"));
|
||||
assert!(!matches("https://www.ebay.com/sch/foo"));
|
||||
assert!(!matches("https://example.com/itm/325478156234"));
|
||||
assert!(!matches("https://www.ebay.com@127.0.0.1/itm/325478156234"));
|
||||
assert!(!matches("https://www.ebay.attacker.com/itm/325478156234"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -5,9 +5,7 @@
|
|||
//! PSK, ECH GREASE) and HTTP/2 options (SETTINGS order, pseudo-header order,
|
||||
//! stream dependency, priorities) to match real browser fingerprints.
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::{borrow::Cow, io, time::Duration};
|
||||
|
||||
use wreq::http2::{
|
||||
Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
|
||||
|
|
@ -21,6 +19,41 @@ use wreq::{Client, Emulation};
|
|||
use crate::browser::BrowserVariant;
|
||||
use crate::error::FetchError;
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
struct PublicDnsResolver;
|
||||
|
||||
impl wreq::dns::Resolve for PublicDnsResolver {
|
||||
fn resolve(&self, name: wreq::dns::Name) -> wreq::dns::Resolving {
|
||||
Box::pin(async move {
|
||||
let addrs = tokio::net::lookup_host((name.as_str(), 0))
|
||||
.await
|
||||
.map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>)?;
|
||||
let mut public = Vec::new();
|
||||
|
||||
for addr in addrs {
|
||||
if crate::url_security::is_blocked_ip(addr.ip()) {
|
||||
let err: Box<dyn std::error::Error + Send + Sync> = Box::new(io::Error::new(
|
||||
io::ErrorKind::PermissionDenied,
|
||||
"DNS resolved to a blocked private or internal address",
|
||||
));
|
||||
return Err(err);
|
||||
}
|
||||
public.push(addr);
|
||||
}
|
||||
|
||||
if public.is_empty() {
|
||||
let err: Box<dyn std::error::Error + Send + Sync> = Box::new(io::Error::new(
|
||||
io::ErrorKind::NotFound,
|
||||
"host did not resolve to any addresses",
|
||||
));
|
||||
return Err(err);
|
||||
}
|
||||
|
||||
Ok(Box::new(public.into_iter()) as wreq::dns::Addrs)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Chrome cipher list (TLS 1.3 + TLS 1.2 in Chrome's exact order).
|
||||
const CHROME_CIPHERS: &str = "TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384:TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256:TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA:TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA:TLS_RSA_WITH_AES_128_GCM_SHA256:TLS_RSA_WITH_AES_256_GCM_SHA384:TLS_RSA_WITH_AES_128_CBC_SHA:TLS_RSA_WITH_AES_256_CBC_SHA";
|
||||
|
||||
|
|
@ -503,6 +536,8 @@ pub fn build_client(
|
|||
let proxy =
|
||||
wreq::Proxy::all(proxy_url).map_err(|e| FetchError::Build(format!("proxy: {e}")))?;
|
||||
builder = builder.proxy(proxy);
|
||||
} else {
|
||||
builder = builder.dns_resolver(PublicDnsResolver);
|
||||
}
|
||||
|
||||
builder
|
||||
|
|
|
|||
|
|
@ -163,7 +163,9 @@ mod tests {
|
|||
Ipv4Addr::new(169, 254, 169, 254),
|
||||
Ipv4Addr::new(172, 16, 0, 1),
|
||||
Ipv4Addr::new(192, 168, 0, 1),
|
||||
Ipv4Addr::new(192, 0, 0, 8),
|
||||
Ipv4Addr::new(198, 18, 0, 1),
|
||||
Ipv4Addr::new(255, 255, 255, 255),
|
||||
] {
|
||||
let url = format!("http://{ip}/");
|
||||
assert!(validate_public_http_url(&url).await.is_err(), "{ip}");
|
||||
|
|
@ -193,4 +195,9 @@ mod tests {
|
|||
);
|
||||
assert!(is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))) == false);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn blocks_localhost_domains_after_resolution() {
|
||||
assert!(validate_public_http_url("http://localhost/").await.is_err());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -51,9 +51,10 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
|
|||
}
|
||||
}
|
||||
|
||||
/// Validate that a URL is non-empty and has an http or https scheme.
|
||||
fn validate_url(url: &str) -> Result<(), String> {
|
||||
webclaw_fetch::url_security::validate_http_url(url)
|
||||
/// Validate that a URL is public HTTP(S), matching the fetch-layer SSRF guard.
|
||||
async fn validate_url(url: &str) -> Result<(), String> {
|
||||
webclaw_fetch::url_security::validate_public_http_url(url)
|
||||
.await
|
||||
.map(|_| ())
|
||||
.map_err(|e| format!("Invalid URL: {e}"))
|
||||
}
|
||||
|
|
@ -161,7 +162,7 @@ impl WebclawMcp {
|
|||
/// Automatically falls back to the webclaw cloud API when bot protection or JS rendering is detected.
|
||||
#[tool]
|
||||
async fn scrape(&self, Parameters(params): Parameters<ScrapeParams>) -> Result<String, String> {
|
||||
validate_url(&params.url)?;
|
||||
validate_url(&params.url).await?;
|
||||
let format = params.format.as_deref().unwrap_or("markdown");
|
||||
let browser = parse_browser(params.browser.as_deref());
|
||||
let include = params.include_selectors.unwrap_or_default();
|
||||
|
|
@ -251,7 +252,7 @@ impl WebclawMcp {
|
|||
/// Crawl a website starting from a seed URL, following links breadth-first up to a configurable depth and page limit.
|
||||
#[tool]
|
||||
async fn crawl(&self, Parameters(params): Parameters<CrawlParams>) -> Result<String, String> {
|
||||
validate_url(&params.url)?;
|
||||
validate_url(&params.url).await?;
|
||||
|
||||
if let Some(max) = params.max_pages
|
||||
&& max > 500
|
||||
|
|
@ -300,7 +301,7 @@ impl WebclawMcp {
|
|||
/// Discover URLs from a website's sitemaps (robots.txt + sitemap.xml).
|
||||
#[tool]
|
||||
async fn map(&self, Parameters(params): Parameters<MapParams>) -> Result<String, String> {
|
||||
validate_url(&params.url)?;
|
||||
validate_url(&params.url).await?;
|
||||
let entries = webclaw_fetch::sitemap::discover(&self.fetch_client, &params.url)
|
||||
.await
|
||||
.map_err(|e| format!("Sitemap discovery failed: {e}"))?;
|
||||
|
|
@ -323,7 +324,7 @@ impl WebclawMcp {
|
|||
return Err("batch is limited to 100 URLs per request".into());
|
||||
}
|
||||
for u in &params.urls {
|
||||
validate_url(u)?;
|
||||
validate_url(u).await?;
|
||||
}
|
||||
|
||||
let format = params.format.as_deref().unwrap_or("markdown");
|
||||
|
|
@ -365,7 +366,7 @@ impl WebclawMcp {
|
|||
&self,
|
||||
Parameters(params): Parameters<ExtractParams>,
|
||||
) -> Result<String, String> {
|
||||
validate_url(&params.url)?;
|
||||
validate_url(&params.url).await?;
|
||||
|
||||
if params.schema.is_none() && params.prompt.is_none() {
|
||||
return Err("Either 'schema' or 'prompt' is required for extraction.".into());
|
||||
|
|
@ -422,7 +423,7 @@ impl WebclawMcp {
|
|||
&self,
|
||||
Parameters(params): Parameters<SummarizeParams>,
|
||||
) -> Result<String, String> {
|
||||
validate_url(&params.url)?;
|
||||
validate_url(&params.url).await?;
|
||||
|
||||
// No local LLM — fall back to cloud API directly
|
||||
if self.llm_chain.is_none() {
|
||||
|
|
@ -464,7 +465,7 @@ impl WebclawMcp {
|
|||
/// Automatically falls back to the webclaw cloud API when bot protection is detected.
|
||||
#[tool]
|
||||
async fn diff(&self, Parameters(params): Parameters<DiffParams>) -> Result<String, String> {
|
||||
validate_url(&params.url)?;
|
||||
validate_url(&params.url).await?;
|
||||
let previous: webclaw_core::ExtractionResult =
|
||||
serde_json::from_str(&params.previous_snapshot)
|
||||
.map_err(|e| format!("Failed to parse previous_snapshot JSON: {e}"))?;
|
||||
|
|
@ -532,7 +533,7 @@ impl WebclawMcp {
|
|||
/// Automatically falls back to the webclaw cloud API when bot protection is detected.
|
||||
#[tool]
|
||||
async fn brand(&self, Parameters(params): Parameters<BrandParams>) -> Result<String, String> {
|
||||
validate_url(&params.url)?;
|
||||
validate_url(&params.url).await?;
|
||||
let fetch_result =
|
||||
tokio::time::timeout(LOCAL_FETCH_TIMEOUT, self.fetch_client.fetch(&params.url))
|
||||
.await
|
||||
|
|
@ -737,7 +738,7 @@ impl WebclawMcp {
|
|||
&self,
|
||||
Parameters(params): Parameters<VerticalParams>,
|
||||
) -> Result<String, String> {
|
||||
validate_url(&params.url)?;
|
||||
validate_url(&params.url).await?;
|
||||
// Use the cached Firefox client, not the default Chrome one.
|
||||
// Reddit's `.json` endpoint rejects the wreq-Chrome TLS
|
||||
// fingerprint with a 403 even from residential IPs (they
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue