fix: validate self-host route URLs consistently

2026-06-07 22:15:12 +02:00 · 2026-05-04 14:30:06 +02:00 · 2026-05-04 14:30:06 +02:00 · 1c9def2fde
commit 1c9def2fde
parent eede2f6953
8 changed files with 26 additions and 10 deletions
--- a/crates/webclaw-server/src/error.rs
+++ b/crates/webclaw-server/src/error.rs
@@ -74,7 +74,16 @@ impl From<webclaw_fetch::FetchError> for ApiError {
            webclaw_fetch::FetchError::InvalidUrl(msg) => {
                Self::BadRequest(format!("invalid url: {msg}"))
            }
-            other => Self::Fetch(other.to_string()),
+            other => {
+                let msg = other.to_string();
+                if msg.contains("invalid url:")
+                    || msg.contains("blocked private or internal address")
+                {
+                    Self::BadRequest(msg)
+                } else {
+                    Self::Fetch(msg)
+                }
+            }
        }
    }
 }
--- a/crates/webclaw-server/src/routes/brand.rs
+++ b/crates/webclaw-server/src/routes/brand.rs
@@ -21,8 +21,9 @@ pub async fn brand(
    if req.url.trim().is_empty() {
        return Err(ApiError::bad_request("`url` is required"));
    }
+    let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;

-    let fetched = state.fetch().fetch(&req.url).await?;
+    let fetched = state.fetch().fetch(url.as_str()).await?;
    let brand = extract_brand(&fetched.html, Some(&fetched.url));

    Ok(Json(json!({
--- a/crates/webclaw-server/src/routes/crawl.rs
+++ b/crates/webclaw-server/src/routes/crawl.rs
@@ -36,6 +36,7 @@ pub async fn crawl(
    if req.url.trim().is_empty() {
        return Err(ApiError::bad_request("`url` is required"));
    }
+    let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
    let max_pages = req.max_pages.unwrap_or(50).min(HARD_MAX_PAGES);
    let max_depth = req.max_depth.unwrap_or(3);
    let concurrency = req.concurrency.unwrap_or(5).min(20);
@@ -56,8 +57,8 @@ pub async fn crawl(
        cancel_flag: None,
    };

-    let crawler = Crawler::new(&req.url, config).map_err(ApiError::from)?;
-    let result = crawler.crawl(&req.url, None).await;
+    let crawler = Crawler::new(url.as_str(), config).map_err(ApiError::from)?;
+    let result = crawler.crawl(url.as_str(), None).await;

    let pages: Vec<Value> = result
        .pages
--- a/crates/webclaw-server/src/routes/diff.rs
+++ b/crates/webclaw-server/src/routes/diff.rs
@@ -75,8 +75,9 @@ pub async fn diff_route(
    if req.url.trim().is_empty() {
        return Err(ApiError::bad_request("`url` is required"));
    }
+    let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;

-    let current = state.fetch().fetch_and_extract(&req.url).await?;
+    let current = state.fetch().fetch_and_extract(url.as_str()).await?;
    let previous = req.previous.into_extraction();
    let result = diff(&previous, &current);

--- a/crates/webclaw-server/src/routes/extract.rs
+++ b/crates/webclaw-server/src/routes/extract.rs
@@ -43,10 +43,11 @@ pub async fn extract(
            "either `schema` or `prompt` is required",
        ));
    }
+    let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;

    // Fetch + extract first so we feed the LLM clean markdown instead of
    // raw HTML. Cheaper tokens, better signal.
-    let extraction = state.fetch().fetch_and_extract(&req.url).await?;
+    let extraction = state.fetch().fetch_and_extract(url.as_str()).await?;
    let content = if extraction.content.markdown.trim().is_empty() {
        extraction.content.plain_text.clone()
    } else {
--- a/crates/webclaw-server/src/routes/map.rs
+++ b/crates/webclaw-server/src/routes/map.rs
@@ -27,8 +27,9 @@ pub async fn map(
    if req.url.trim().is_empty() {
        return Err(ApiError::bad_request("`url` is required"));
    }
+    let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;

-    let entries = sitemap::discover(state.fetch(), &req.url).await?;
+    let entries = sitemap::discover(state.fetch(), url.as_str()).await?;

    let body = if req.include_metadata {
        json!({
--- a/crates/webclaw-server/src/routes/structured.rs
+++ b/crates/webclaw-server/src/routes/structured.rs
@@ -25,7 +25,7 @@ impl From<ExtractorDispatchError> for ApiError {
        match e {
            ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
            ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),
-            ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()),
+            ExtractorDispatchError::Fetch(f) => ApiError::from(f),
        }
    }
 }
@@ -46,7 +46,8 @@ pub async fn scrape_vertical(
    if req.url.trim().is_empty() {
        return Err(ApiError::bad_request("`url` is required"));
    }
-    let data = extractors::dispatch_by_name(state.fetch(), &vertical, &req.url).await?;
+    let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
+    let data = extractors::dispatch_by_name(state.fetch(), &vertical, url.as_str()).await?;
    Ok(Json(json!({
        "vertical": vertical,
        "url": req.url,
--- a/crates/webclaw-server/src/routes/summarize.rs
+++ b/crates/webclaw-server/src/routes/summarize.rs
@@ -22,8 +22,9 @@ pub async fn summarize_route(
    if req.url.trim().is_empty() {
        return Err(ApiError::bad_request("`url` is required"));
    }
+    let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;

-    let extraction = state.fetch().fetch_and_extract(&req.url).await?;
+    let extraction = state.fetch().fetch_and_extract(url.as_str()).await?;
    let content = if extraction.content.markdown.trim().is_empty() {
        extraction.content.plain_text.clone()
    } else {