fix: validate self-host route URLs consistently

This commit is contained in:
Valerio 2026-05-04 14:30:06 +02:00
parent eede2f6953
commit 1c9def2fde
8 changed files with 26 additions and 10 deletions

View file

@ -74,7 +74,16 @@ impl From<webclaw_fetch::FetchError> for ApiError {
webclaw_fetch::FetchError::InvalidUrl(msg) => {
Self::BadRequest(format!("invalid url: {msg}"))
}
other => Self::Fetch(other.to_string()),
other => {
let msg = other.to_string();
if msg.contains("invalid url:")
|| msg.contains("blocked private or internal address")
{
Self::BadRequest(msg)
} else {
Self::Fetch(msg)
}
}
}
}
}

View file

@ -21,8 +21,9 @@ pub async fn brand(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
let fetched = state.fetch().fetch(&req.url).await?;
let fetched = state.fetch().fetch(url.as_str()).await?;
let brand = extract_brand(&fetched.html, Some(&fetched.url));
Ok(Json(json!({

View file

@ -36,6 +36,7 @@ pub async fn crawl(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
let max_pages = req.max_pages.unwrap_or(50).min(HARD_MAX_PAGES);
let max_depth = req.max_depth.unwrap_or(3);
let concurrency = req.concurrency.unwrap_or(5).min(20);
@ -56,8 +57,8 @@ pub async fn crawl(
cancel_flag: None,
};
let crawler = Crawler::new(&req.url, config).map_err(ApiError::from)?;
let result = crawler.crawl(&req.url, None).await;
let crawler = Crawler::new(url.as_str(), config).map_err(ApiError::from)?;
let result = crawler.crawl(url.as_str(), None).await;
let pages: Vec<Value> = result
.pages

View file

@ -75,8 +75,9 @@ pub async fn diff_route(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
let current = state.fetch().fetch_and_extract(&req.url).await?;
let current = state.fetch().fetch_and_extract(url.as_str()).await?;
let previous = req.previous.into_extraction();
let result = diff(&previous, &current);

View file

@ -43,10 +43,11 @@ pub async fn extract(
"either `schema` or `prompt` is required",
));
}
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
// Fetch + extract first so we feed the LLM clean markdown instead of
// raw HTML. Cheaper tokens, better signal.
let extraction = state.fetch().fetch_and_extract(&req.url).await?;
let extraction = state.fetch().fetch_and_extract(url.as_str()).await?;
let content = if extraction.content.markdown.trim().is_empty() {
extraction.content.plain_text.clone()
} else {

View file

@ -27,8 +27,9 @@ pub async fn map(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
let entries = sitemap::discover(state.fetch(), &req.url).await?;
let entries = sitemap::discover(state.fetch(), url.as_str()).await?;
let body = if req.include_metadata {
json!({

View file

@ -25,7 +25,7 @@ impl From<ExtractorDispatchError> for ApiError {
match e {
ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),
ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()),
ExtractorDispatchError::Fetch(f) => ApiError::from(f),
}
}
}
@ -46,7 +46,8 @@ pub async fn scrape_vertical(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
let data = extractors::dispatch_by_name(state.fetch(), &vertical, &req.url).await?;
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
let data = extractors::dispatch_by_name(state.fetch(), &vertical, url.as_str()).await?;
Ok(Json(json!({
"vertical": vertical,
"url": req.url,

View file

@ -22,8 +22,9 @@ pub async fn summarize_route(
if req.url.trim().is_empty() {
return Err(ApiError::bad_request("`url` is required"));
}
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
let extraction = state.fetch().fetch_and_extract(&req.url).await?;
let extraction = state.fetch().fetch_and_extract(url.as_str()).await?;
let content = if extraction.content.markdown.trim().is_empty() {
extraction.content.plain_text.clone()
} else {