diff --git a/crates/webclaw-server/src/error.rs b/crates/webclaw-server/src/error.rs index 7f1d36e..a63848f 100644 --- a/crates/webclaw-server/src/error.rs +++ b/crates/webclaw-server/src/error.rs @@ -74,7 +74,16 @@ impl From for ApiError { webclaw_fetch::FetchError::InvalidUrl(msg) => { Self::BadRequest(format!("invalid url: {msg}")) } - other => Self::Fetch(other.to_string()), + other => { + let msg = other.to_string(); + if msg.contains("invalid url:") + || msg.contains("blocked private or internal address") + { + Self::BadRequest(msg) + } else { + Self::Fetch(msg) + } + } } } } diff --git a/crates/webclaw-server/src/routes/brand.rs b/crates/webclaw-server/src/routes/brand.rs index 908976a..f3f6a43 100644 --- a/crates/webclaw-server/src/routes/brand.rs +++ b/crates/webclaw-server/src/routes/brand.rs @@ -21,8 +21,9 @@ pub async fn brand( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; - let fetched = state.fetch().fetch(&req.url).await?; + let fetched = state.fetch().fetch(url.as_str()).await?; let brand = extract_brand(&fetched.html, Some(&fetched.url)); Ok(Json(json!({ diff --git a/crates/webclaw-server/src/routes/crawl.rs b/crates/webclaw-server/src/routes/crawl.rs index 4d15195..9ea484c 100644 --- a/crates/webclaw-server/src/routes/crawl.rs +++ b/crates/webclaw-server/src/routes/crawl.rs @@ -36,6 +36,7 @@ pub async fn crawl( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; let max_pages = req.max_pages.unwrap_or(50).min(HARD_MAX_PAGES); let max_depth = req.max_depth.unwrap_or(3); let concurrency = req.concurrency.unwrap_or(5).min(20); @@ -56,8 +57,8 @@ pub async fn crawl( cancel_flag: None, }; - let crawler = Crawler::new(&req.url, config).map_err(ApiError::from)?; - let result = crawler.crawl(&req.url, 
None).await; + let crawler = Crawler::new(url.as_str(), config).map_err(ApiError::from)?; + let result = crawler.crawl(url.as_str(), None).await; let pages: Vec = result .pages diff --git a/crates/webclaw-server/src/routes/diff.rs b/crates/webclaw-server/src/routes/diff.rs index e4e038d..b0706fb 100644 --- a/crates/webclaw-server/src/routes/diff.rs +++ b/crates/webclaw-server/src/routes/diff.rs @@ -75,8 +75,9 @@ pub async fn diff_route( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; - let current = state.fetch().fetch_and_extract(&req.url).await?; + let current = state.fetch().fetch_and_extract(url.as_str()).await?; let previous = req.previous.into_extraction(); let result = diff(&previous, &current); diff --git a/crates/webclaw-server/src/routes/extract.rs b/crates/webclaw-server/src/routes/extract.rs index 05b8909..55b34a0 100644 --- a/crates/webclaw-server/src/routes/extract.rs +++ b/crates/webclaw-server/src/routes/extract.rs @@ -43,10 +43,11 @@ pub async fn extract( "either `schema` or `prompt` is required", )); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; // Fetch + extract first so we feed the LLM clean markdown instead of // raw HTML. Cheaper tokens, better signal. 
- let extraction = state.fetch().fetch_and_extract(&req.url).await?; + let extraction = state.fetch().fetch_and_extract(url.as_str()).await?; let content = if extraction.content.markdown.trim().is_empty() { extraction.content.plain_text.clone() } else { diff --git a/crates/webclaw-server/src/routes/map.rs b/crates/webclaw-server/src/routes/map.rs index 846183a..6daec69 100644 --- a/crates/webclaw-server/src/routes/map.rs +++ b/crates/webclaw-server/src/routes/map.rs @@ -27,8 +27,9 @@ pub async fn map( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; - let entries = sitemap::discover(state.fetch(), &req.url).await?; + let entries = sitemap::discover(state.fetch(), url.as_str()).await?; let body = if req.include_metadata { json!({ diff --git a/crates/webclaw-server/src/routes/structured.rs b/crates/webclaw-server/src/routes/structured.rs index c9cdc1a..9c10b67 100644 --- a/crates/webclaw-server/src/routes/structured.rs +++ b/crates/webclaw-server/src/routes/structured.rs @@ -25,7 +25,7 @@ impl From for ApiError { match e { ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound, ExtractorDispatchError::UrlMismatch { .. 
} => ApiError::bad_request(e.to_string()), - ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()), + ExtractorDispatchError::Fetch(f) => ApiError::from(f), } } } @@ -46,7 +46,8 @@ pub async fn scrape_vertical( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } - let data = extractors::dispatch_by_name(state.fetch(), &vertical, &req.url).await?; + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; + let data = extractors::dispatch_by_name(state.fetch(), &vertical, url.as_str()).await?; Ok(Json(json!({ "vertical": vertical, "url": req.url, diff --git a/crates/webclaw-server/src/routes/summarize.rs b/crates/webclaw-server/src/routes/summarize.rs index b967f1f..6b645ab 100644 --- a/crates/webclaw-server/src/routes/summarize.rs +++ b/crates/webclaw-server/src/routes/summarize.rs @@ -22,8 +22,9 @@ pub async fn summarize_route( if req.url.trim().is_empty() { return Err(ApiError::bad_request("`url` is required")); } + let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?; - let extraction = state.fetch().fetch_and_extract(&req.url).await?; + let extraction = state.fetch().fetch_and_extract(url.as_str()).await?; let content = if extraction.content.markdown.trim().is_empty() { extraction.content.plain_text.clone() } else {