mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
fix: harden fetch URL validation
This commit is contained in:
parent
23544f8fac
commit
bdf81fe6bf
10 changed files with 284 additions and 27 deletions
|
|
@ -73,11 +73,9 @@ COPY --from=builder /build/target/release/webclaw-server /usr/local/bin/webclaw-
|
||||||
# as documentation; callers still need `-p 3000:3000` on `docker run`.
|
# as documentation; callers still need `-p 3000:3000` on `docker run`.
|
||||||
EXPOSE 3000
|
EXPOSE 3000
|
||||||
|
|
||||||
# Container default: bind all interfaces so `-p 3000:3000` works. The binary
|
# Container default: bind all interfaces so `-p 3000:3000` works. Public
|
||||||
# itself defaults to 127.0.0.1 (safe for `cargo run` on a laptop); inside
|
# binding requires WEBCLAW_API_KEY; the binary refuses open-auth 0.0.0.0
|
||||||
# Docker that would make the server unreachable, so we flip it here.
|
# unless WEBCLAW_ALLOW_OPEN_PUBLIC=1 is set explicitly for local testing.
|
||||||
# Override with -e WEBCLAW_HOST=127.0.0.1 if you front this with another
|
|
||||||
# process in the same container.
|
|
||||||
ENV WEBCLAW_HOST=0.0.0.0
|
ENV WEBCLAW_HOST=0.0.0.0
|
||||||
|
|
||||||
# Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other
|
# Entrypoint shim: forwards webclaw args/URL to the binary, but exec's other
|
||||||
|
|
|
||||||
|
|
@ -199,6 +199,8 @@ impl FetchClient {
|
||||||
config.timeout,
|
config.timeout,
|
||||||
&config.headers,
|
&config.headers,
|
||||||
config.proxy.as_deref(),
|
config.proxy.as_deref(),
|
||||||
|
config.follow_redirects,
|
||||||
|
config.max_redirects,
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
.collect::<Result<Vec<_>, _>>()?;
|
.collect::<Result<Vec<_>, _>>()?;
|
||||||
|
|
@ -218,7 +220,14 @@ impl FetchClient {
|
||||||
.iter()
|
.iter()
|
||||||
.map(|proxy| {
|
.map(|proxy| {
|
||||||
let v = *variants.choose(&mut rng).unwrap();
|
let v = *variants.choose(&mut rng).unwrap();
|
||||||
crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy))
|
crate::tls::build_client(
|
||||||
|
v,
|
||||||
|
config.timeout,
|
||||||
|
&config.headers,
|
||||||
|
Some(proxy),
|
||||||
|
config.follow_redirects,
|
||||||
|
config.max_redirects,
|
||||||
|
)
|
||||||
})
|
})
|
||||||
.collect::<Result<Vec<_>, _>>()?;
|
.collect::<Result<Vec<_>, _>>()?;
|
||||||
|
|
||||||
|
|
@ -379,6 +388,8 @@ impl FetchClient {
|
||||||
url: &str,
|
url: &str,
|
||||||
extra: &[(&str, &str)],
|
extra: &[(&str, &str)],
|
||||||
) -> Result<FetchResult, FetchError> {
|
) -> Result<FetchResult, FetchError> {
|
||||||
|
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
|
||||||
|
let url = parsed_url.as_str();
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
let client = self.pick_client(url);
|
let client = self.pick_client(url);
|
||||||
|
|
||||||
|
|
@ -463,13 +474,17 @@ impl FetchClient {
|
||||||
url: &str,
|
url: &str,
|
||||||
options: &webclaw_core::ExtractionOptions,
|
options: &webclaw_core::ExtractionOptions,
|
||||||
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
||||||
|
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
|
||||||
|
let url = parsed_url.as_str();
|
||||||
|
|
||||||
// Reddit fallback: use their JSON API to get post + full comment tree.
|
// Reddit fallback: use their JSON API to get post + full comment tree.
|
||||||
if crate::reddit::is_reddit_url(url) {
|
if crate::reddit::is_reddit_url(url) {
|
||||||
let json_url = crate::reddit::json_url(url);
|
let json_url = crate::reddit::json_url(url);
|
||||||
|
let json_url = crate::url_security::validate_public_http_url(&json_url).await?;
|
||||||
debug!("reddit detected, fetching {json_url}");
|
debug!("reddit detected, fetching {json_url}");
|
||||||
|
|
||||||
let client = self.pick_client(url);
|
let client = self.pick_client(url);
|
||||||
let resp = client.get(&json_url).send().await?;
|
let resp = client.get(json_url.as_str()).send().await?;
|
||||||
let response = Response::from_wreq(resp).await?;
|
let response = Response::from_wreq(resp).await?;
|
||||||
if response.is_success() {
|
if response.is_success() {
|
||||||
let bytes = response.body();
|
let bytes = response.body();
|
||||||
|
|
@ -491,7 +506,7 @@ impl FetchClient {
|
||||||
&& let Some(homepage) = extract_homepage(url)
|
&& let Some(homepage) = extract_homepage(url)
|
||||||
{
|
{
|
||||||
debug!("challenge detected, warming cookies via {homepage}");
|
debug!("challenge detected, warming cookies via {homepage}");
|
||||||
let _ = client.get(&homepage).send().await;
|
let _ = self.fetch(&homepage).await;
|
||||||
let resp = client.get(url).send().await?;
|
let resp = client.get(url).send().await?;
|
||||||
response = Response::from_wreq(resp).await?;
|
response = Response::from_wreq(resp).await?;
|
||||||
debug!("retried after cookie warmup: status={}", response.status());
|
debug!("retried after cookie warmup: status={}", response.status());
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ pub mod proxy;
|
||||||
pub mod reddit;
|
pub mod reddit;
|
||||||
pub mod sitemap;
|
pub mod sitemap;
|
||||||
pub mod tls;
|
pub mod tls;
|
||||||
|
pub mod url_security;
|
||||||
|
|
||||||
pub use browser::BrowserProfile;
|
pub use browser::BrowserProfile;
|
||||||
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
|
pub use client::{BatchExtractResult, BatchResult, FetchClient, FetchConfig, FetchResult};
|
||||||
|
|
|
||||||
|
|
@ -455,6 +455,8 @@ pub fn build_client(
|
||||||
timeout: Duration,
|
timeout: Duration,
|
||||||
extra_headers: &std::collections::HashMap<String, String>,
|
extra_headers: &std::collections::HashMap<String, String>,
|
||||||
proxy: Option<&str>,
|
proxy: Option<&str>,
|
||||||
|
follow_redirects: bool,
|
||||||
|
max_redirects: u32,
|
||||||
) -> Result<Client, FetchError> {
|
) -> Result<Client, FetchError> {
|
||||||
// SafariIos26 builds its Emulation on top of wreq-util's base instead
|
// SafariIos26 builds its Emulation on top of wreq-util's base instead
|
||||||
// of from scratch. See `safari_ios_emulation` for why.
|
// of from scratch. See `safari_ios_emulation` for why.
|
||||||
|
|
@ -490,7 +492,10 @@ pub fn build_client(
|
||||||
|
|
||||||
let mut builder = Client::builder()
|
let mut builder = Client::builder()
|
||||||
.emulation(emulation)
|
.emulation(emulation)
|
||||||
.redirect(wreq::redirect::Policy::limited(10))
|
.redirect(ssrf_safe_redirect_policy(
|
||||||
|
follow_redirects,
|
||||||
|
max_redirects as usize,
|
||||||
|
))
|
||||||
.cookie_store(true)
|
.cookie_store(true)
|
||||||
.timeout(timeout);
|
.timeout(timeout);
|
||||||
|
|
||||||
|
|
@ -504,3 +509,26 @@ pub fn build_client(
|
||||||
.build()
|
.build()
|
||||||
.map_err(|e| FetchError::Build(e.to_string()))
|
.map_err(|e| FetchError::Build(e.to_string()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn ssrf_safe_redirect_policy(
|
||||||
|
follow_redirects: bool,
|
||||||
|
max_redirects: usize,
|
||||||
|
) -> wreq::redirect::Policy {
|
||||||
|
if !follow_redirects {
|
||||||
|
return wreq::redirect::Policy::none();
|
||||||
|
}
|
||||||
|
|
||||||
|
wreq::redirect::Policy::custom(move |attempt| {
|
||||||
|
if attempt.previous.len() > max_redirects {
|
||||||
|
return attempt.error("too many redirects");
|
||||||
|
}
|
||||||
|
|
||||||
|
attempt.pending(|attempt| async move {
|
||||||
|
let next_url = attempt.uri.to_string();
|
||||||
|
match crate::url_security::validate_public_http_url(&next_url).await {
|
||||||
|
Ok(_) => attempt.follow(),
|
||||||
|
Err(e) => attempt.error(e.to_string()),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
|
||||||
196
crates/webclaw-fetch/src/url_security.rs
Normal file
196
crates/webclaw-fetch/src/url_security.rs
Normal file
|
|
@ -0,0 +1,196 @@
|
||||||
|
//! SSRF guard for every server-side fetch.
|
||||||
|
//!
|
||||||
|
//! Callers may still do cheap parse validation at the edge, but this
|
||||||
|
//! module is the fetch-layer authority because redirects and helper
|
||||||
|
//! fetches also pass through it.
|
||||||
|
|
||||||
|
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
|
||||||
|
|
||||||
|
use tokio::net::lookup_host;
|
||||||
|
use url::{Host, Url};
|
||||||
|
|
||||||
|
use crate::error::FetchError;
|
||||||
|
|
||||||
|
/// Parse a caller-provided URL and require an HTTP(S) host.
|
||||||
|
pub fn validate_http_url(raw: &str) -> Result<Url, FetchError> {
|
||||||
|
let trimmed = raw.trim();
|
||||||
|
if trimmed.is_empty() {
|
||||||
|
return Err(FetchError::InvalidUrl("URL must not be empty".into()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let parsed =
|
||||||
|
Url::parse(trimmed).map_err(|e| FetchError::InvalidUrl(format!("invalid URL: {e}")))?;
|
||||||
|
match parsed.scheme() {
|
||||||
|
"http" | "https" => {}
|
||||||
|
scheme => {
|
||||||
|
return Err(FetchError::InvalidUrl(format!(
|
||||||
|
"scheme '{scheme}' is not allowed, use http:// or https://"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if parsed.host().is_none() {
|
||||||
|
return Err(FetchError::InvalidUrl("URL must include a host".into()));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(parsed)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse, resolve, and reject private/internal destinations.
|
||||||
|
///
|
||||||
|
/// A domain is rejected if any resolved address is private or reserved.
|
||||||
|
/// That is intentionally conservative: mixed public/private DNS answers
|
||||||
|
/// are unsafe for server-side fetching.
|
||||||
|
pub async fn validate_public_http_url(raw: &str) -> Result<Url, FetchError> {
|
||||||
|
let parsed = validate_http_url(raw)?;
|
||||||
|
validate_url_host_is_public(&parsed).await?;
|
||||||
|
Ok(parsed)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn validate_url_host_is_public(url: &Url) -> Result<(), FetchError> {
|
||||||
|
match url.host() {
|
||||||
|
Some(Host::Ipv4(ip)) => reject_blocked_ip(IpAddr::V4(ip)),
|
||||||
|
Some(Host::Ipv6(ip)) => reject_blocked_ip(IpAddr::V6(ip)),
|
||||||
|
Some(Host::Domain(host)) => {
|
||||||
|
let port = url
|
||||||
|
.port_or_known_default()
|
||||||
|
.ok_or_else(|| FetchError::InvalidUrl("URL must include a known port".into()))?;
|
||||||
|
let addrs = lookup_host((host, port))
|
||||||
|
.await
|
||||||
|
.map_err(|e| FetchError::InvalidUrl(format!("failed to resolve host: {e}")))?;
|
||||||
|
|
||||||
|
let mut resolved = false;
|
||||||
|
for addr in addrs {
|
||||||
|
resolved = true;
|
||||||
|
reject_blocked_ip(addr.ip())?;
|
||||||
|
}
|
||||||
|
if !resolved {
|
||||||
|
return Err(FetchError::InvalidUrl(
|
||||||
|
"host did not resolve to any addresses".into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
None => Err(FetchError::InvalidUrl("URL must include a host".into())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn reject_blocked_ip(ip: IpAddr) -> Result<(), FetchError> {
|
||||||
|
if is_blocked_ip(ip) {
|
||||||
|
Err(FetchError::InvalidUrl(
|
||||||
|
"URL resolves to a blocked private or internal address".into(),
|
||||||
|
))
|
||||||
|
} else {
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return true for IP ranges that should never be fetched server-side.
|
||||||
|
pub fn is_blocked_ip(ip: IpAddr) -> bool {
|
||||||
|
match ip {
|
||||||
|
IpAddr::V4(ip) => is_blocked_ipv4(ip),
|
||||||
|
IpAddr::V6(ip) => is_blocked_ipv6(ip),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Report whether an IPv4 address belongs to a non-public range.
///
/// Blocked ranges, matched on the address octets:
///
/// * `0.0.0.0/8` ("this network", includes the unspecified address)
/// * `10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16` (private)
/// * `100.64.0.0/10` (carrier-grade NAT shared space)
/// * `127.0.0.0/8` (loopback)
/// * `169.254.0.0/16` (link-local)
/// * `192.0.0.0/24` (IETF protocol assignments)
/// * `192.0.2.0/24`, `198.51.100.0/24`, `203.0.113.0/24` (documentation)
/// * `198.18.0.0/15` (benchmarking)
/// * `224.0.0.0/3` (multicast, reserved, broadcast)
fn is_blocked_ipv4(ip: Ipv4Addr) -> bool {
    matches!(
        ip.octets(),
        [0, ..]
            | [10, ..]
            | [100, 64..=127, ..]
            | [127, ..]
            | [169, 254, ..]
            | [172, 16..=31, ..]
            | [192, 0, 0 | 2, _]
            | [192, 168, ..]
            | [198, 18..=19, ..]
            | [198, 51, 100, _]
            | [203, 0, 113, _]
            | [224..=255, ..]
    )
}
|
||||||
|
|
||||||
|
fn is_blocked_ipv6(ip: Ipv6Addr) -> bool {
|
||||||
|
let s = ip.segments();
|
||||||
|
|
||||||
|
ip.is_unspecified()
|
||||||
|
|| ip.is_loopback()
|
||||||
|
|| ip.is_multicast()
|
||||||
|
|| (s[0] & 0xfe00) == 0xfc00
|
||||||
|
|| (s[0] & 0xffc0) == 0xfe80
|
||||||
|
|| (s[0] == 0x0064 && s[1] == 0xff9b && s[2] == 0 && s[3] == 0 && s[4] == 0 && s[5] == 0)
|
||||||
|
|| (s[0] == 0x2001 && s[1] == 0x0db8)
|
||||||
|
|| embedded_ipv4(ip).is_some_and(is_blocked_ipv4)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract the IPv4 address carried in the low 32 bits of an IPv6 address.
///
/// Two layouts are recognised: IPv4-mapped (`::ffff:a.b.c.d`) and
/// IPv4-compatible (`::a.b.c.d`). Because the compatible form is just "the
/// first 96 bits are zero", `::` and `::1` also decode (to `0.0.0.0` and
/// `0.0.0.1`). Any other address yields `None`.
fn embedded_ipv4(ip: Ipv6Addr) -> Option<Ipv4Addr> {
    match ip.segments() {
        [0, 0, 0, 0, 0, 0 | 0xffff, high, low] => {
            let [a, b] = high.to_be_bytes();
            let [c, d] = low.to_be_bytes();
            Some(Ipv4Addr::new(a, b, c, d))
        }
        _ => None,
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};

    use super::{is_blocked_ip, validate_public_http_url};

    /// IPv4 literals in internal ranges are refused by the full URL check.
    #[tokio::test]
    async fn blocks_ipv4_internal_ranges() {
        for ip in [
            Ipv4Addr::new(0, 0, 0, 0),
            Ipv4Addr::new(10, 0, 0, 1),
            Ipv4Addr::new(100, 64, 0, 1),
            Ipv4Addr::new(127, 0, 0, 1),
            Ipv4Addr::new(169, 254, 169, 254),
            Ipv4Addr::new(172, 16, 0, 1),
            Ipv4Addr::new(192, 168, 0, 1),
            Ipv4Addr::new(198, 18, 0, 1),
        ] {
            let url = format!("http://{ip}/");
            assert!(validate_public_http_url(&url).await.is_err(), "{ip}");
        }
    }

    /// IPv6 internal ranges are classified as blocked. Nothing is awaited
    /// here, so this is a plain synchronous test.
    #[test]
    fn blocks_ipv6_internal_ranges() {
        for ip in [
            Ipv6Addr::LOCALHOST,
            Ipv6Addr::UNSPECIFIED,
            "fc00::1".parse().unwrap(),
            "fe80::1".parse().unwrap(),
            "64:ff9b::7f00:1".parse().unwrap(),
            "::ffff:127.0.0.1".parse().unwrap(),
        ] {
            assert!(is_blocked_ip(IpAddr::V6(ip)), "{ip}");
        }
    }

    /// Public IP literals pass both the URL check and the raw IP check.
    #[tokio::test]
    async fn allows_public_ip_literals() {
        assert!(
            validate_public_http_url("https://93.184.216.34/")
                .await
                .is_ok()
        );
        assert!(!is_blocked_ip(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8))));
    }
}
|
||||||
|
|
@ -13,7 +13,6 @@ use rmcp::model::{Implementation, ServerCapabilities, ServerInfo};
|
||||||
use rmcp::{ServerHandler, tool, tool_handler, tool_router};
|
use rmcp::{ServerHandler, tool, tool_handler, tool_router};
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
use tracing::{error, info, warn};
|
use tracing::{error, info, warn};
|
||||||
use url::Url;
|
|
||||||
|
|
||||||
use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult};
|
use webclaw_fetch::cloud::{self, CloudClient, SmartFetchResult};
|
||||||
|
|
||||||
|
|
@ -54,19 +53,9 @@ fn parse_browser(browser: Option<&str>) -> webclaw_fetch::BrowserProfile {
|
||||||
|
|
||||||
/// Validate that a URL is non-empty and has an http or https scheme.
|
/// Validate that a URL is non-empty and has an http or https scheme.
|
||||||
fn validate_url(url: &str) -> Result<(), String> {
|
fn validate_url(url: &str) -> Result<(), String> {
|
||||||
if url.is_empty() {
|
webclaw_fetch::url_security::validate_http_url(url)
|
||||||
return Err("Invalid URL: must not be empty".into());
|
.map(|_| ())
|
||||||
}
|
.map_err(|e| format!("Invalid URL: {e}"))
|
||||||
match Url::parse(url) {
|
|
||||||
Ok(parsed) if parsed.scheme() == "http" || parsed.scheme() == "https" => Ok(()),
|
|
||||||
Ok(parsed) => Err(format!(
|
|
||||||
"Invalid URL: scheme '{}' not allowed, must start with http:// or https://",
|
|
||||||
parsed.scheme()
|
|
||||||
)),
|
|
||||||
Err(e) => Err(format!(
|
|
||||||
"Invalid URL: {e}. Must start with http:// or https://"
|
|
||||||
)),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Timeout for local fetch calls (prevents hanging on tarpitting servers).
|
/// Timeout for local fetch calls (prevents hanging on tarpitting servers).
|
||||||
|
|
|
||||||
|
|
@ -70,7 +70,12 @@ impl IntoResponse for ApiError {
|
||||||
|
|
||||||
impl From<webclaw_fetch::FetchError> for ApiError {
|
impl From<webclaw_fetch::FetchError> for ApiError {
|
||||||
fn from(e: webclaw_fetch::FetchError) -> Self {
|
fn from(e: webclaw_fetch::FetchError) -> Self {
|
||||||
Self::Fetch(e.to_string())
|
match e {
|
||||||
|
webclaw_fetch::FetchError::InvalidUrl(msg) => {
|
||||||
|
Self::BadRequest(format!("invalid url: {msg}"))
|
||||||
|
}
|
||||||
|
other => Self::Fetch(other.to_string()),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -75,6 +75,15 @@ async fn main() -> anyhow::Result<()> {
|
||||||
.compact()
|
.compact()
|
||||||
.init();
|
.init();
|
||||||
|
|
||||||
|
if is_unspecified_addr(args.host)
|
||||||
|
&& args.api_key.is_none()
|
||||||
|
&& std::env::var_os("WEBCLAW_ALLOW_OPEN_PUBLIC").is_none()
|
||||||
|
{
|
||||||
|
anyhow::bail!(
|
||||||
|
"refusing to bind 0.0.0.0/[::] without WEBCLAW_API_KEY; set WEBCLAW_API_KEY or WEBCLAW_ALLOW_OPEN_PUBLIC=1 to override"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
let state = AppState::new(args.api_key.clone())?;
|
let state = AppState::new(args.api_key.clone())?;
|
||||||
|
|
||||||
let v1 = Router::new()
|
let v1 = Router::new()
|
||||||
|
|
@ -121,3 +130,10 @@ async fn main() -> anyhow::Result<()> {
|
||||||
axum::serve(listener, app).await?;
|
axum::serve(listener, app).await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Report whether `addr` is the wildcard bind address (`0.0.0.0` or `::`).
///
/// Thin wrapper over [`IpAddr::is_unspecified`], which already handles both
/// address families.
fn is_unspecified_addr(addr: IpAddr) -> bool {
    addr.is_unspecified()
}
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,14 @@ pub async fn batch(
|
||||||
req.urls.len()
|
req.urls.len()
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
let mut safe_urls = Vec::with_capacity(req.urls.len());
|
||||||
|
for url in &req.urls {
|
||||||
|
safe_urls.push(
|
||||||
|
webclaw_fetch::url_security::validate_public_http_url(url)
|
||||||
|
.await?
|
||||||
|
.to_string(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
let concurrency = req.concurrency.unwrap_or(5).clamp(1, HARD_MAX_CONCURRENCY);
|
let concurrency = req.concurrency.unwrap_or(5).clamp(1, HARD_MAX_CONCURRENCY);
|
||||||
|
|
||||||
|
|
@ -47,7 +55,7 @@ pub async fn batch(
|
||||||
include_raw_html: false,
|
include_raw_html: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
let url_refs: Vec<&str> = req.urls.iter().map(|s| s.as_str()).collect();
|
let url_refs: Vec<&str> = safe_urls.iter().map(|s| s.as_str()).collect();
|
||||||
let results = state
|
let results = state
|
||||||
.fetch()
|
.fetch()
|
||||||
.fetch_and_extract_batch_with_options(&url_refs, concurrency, &options)
|
.fetch_and_extract_batch_with_options(&url_refs, concurrency, &options)
|
||||||
|
|
|
||||||
|
|
@ -52,6 +52,7 @@ pub async fn scrape(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
let formats = req.formats.as_vec();
|
let formats = req.formats.as_vec();
|
||||||
|
|
||||||
let options = ExtractionOptions {
|
let options = ExtractionOptions {
|
||||||
|
|
@ -63,11 +64,11 @@ pub async fn scrape(
|
||||||
|
|
||||||
let extraction = state
|
let extraction = state
|
||||||
.fetch()
|
.fetch()
|
||||||
.fetch_and_extract_with_options(&req.url, &options)
|
.fetch_and_extract_with_options(url.as_str(), &options)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
let mut body = json!({
|
let mut body = json!({
|
||||||
"url": extraction.metadata.url.clone().unwrap_or_else(|| req.url.clone()),
|
"url": extraction.metadata.url.clone().unwrap_or_else(|| url.to_string()),
|
||||||
"metadata": extraction.metadata,
|
"metadata": extraction.metadata,
|
||||||
});
|
});
|
||||||
let obj = body.as_object_mut().expect("json::object");
|
let obj = body.as_object_mut().expect("json::object");
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue