mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-07 22:15:12 +02:00
fix: validate self-host route URLs consistently
This commit is contained in:
parent
eede2f6953
commit
1c9def2fde
8 changed files with 26 additions and 10 deletions
|
|
@ -74,7 +74,16 @@ impl From<webclaw_fetch::FetchError> for ApiError {
|
|||
webclaw_fetch::FetchError::InvalidUrl(msg) => {
|
||||
Self::BadRequest(format!("invalid url: {msg}"))
|
||||
}
|
||||
other => Self::Fetch(other.to_string()),
|
||||
other => {
|
||||
let msg = other.to_string();
|
||||
if msg.contains("invalid url:")
|
||||
|| msg.contains("blocked private or internal address")
|
||||
{
|
||||
Self::BadRequest(msg)
|
||||
} else {
|
||||
Self::Fetch(msg)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -21,8 +21,9 @@ pub async fn brand(
|
|||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||
|
||||
let fetched = state.fetch().fetch(&req.url).await?;
|
||||
let fetched = state.fetch().fetch(url.as_str()).await?;
|
||||
let brand = extract_brand(&fetched.html, Some(&fetched.url));
|
||||
|
||||
Ok(Json(json!({
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ pub async fn crawl(
|
|||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||
let max_pages = req.max_pages.unwrap_or(50).min(HARD_MAX_PAGES);
|
||||
let max_depth = req.max_depth.unwrap_or(3);
|
||||
let concurrency = req.concurrency.unwrap_or(5).min(20);
|
||||
|
|
@ -56,8 +57,8 @@ pub async fn crawl(
|
|||
cancel_flag: None,
|
||||
};
|
||||
|
||||
let crawler = Crawler::new(&req.url, config).map_err(ApiError::from)?;
|
||||
let result = crawler.crawl(&req.url, None).await;
|
||||
let crawler = Crawler::new(url.as_str(), config).map_err(ApiError::from)?;
|
||||
let result = crawler.crawl(url.as_str(), None).await;
|
||||
|
||||
let pages: Vec<Value> = result
|
||||
.pages
|
||||
|
|
|
|||
|
|
@ -75,8 +75,9 @@ pub async fn diff_route(
|
|||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||
|
||||
let current = state.fetch().fetch_and_extract(&req.url).await?;
|
||||
let current = state.fetch().fetch_and_extract(url.as_str()).await?;
|
||||
let previous = req.previous.into_extraction();
|
||||
let result = diff(&previous, &current);
|
||||
|
||||
|
|
|
|||
|
|
@ -43,10 +43,11 @@ pub async fn extract(
|
|||
"either `schema` or `prompt` is required",
|
||||
));
|
||||
}
|
||||
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||
|
||||
// Fetch + extract first so we feed the LLM clean markdown instead of
|
||||
// raw HTML. Cheaper tokens, better signal.
|
||||
let extraction = state.fetch().fetch_and_extract(&req.url).await?;
|
||||
let extraction = state.fetch().fetch_and_extract(url.as_str()).await?;
|
||||
let content = if extraction.content.markdown.trim().is_empty() {
|
||||
extraction.content.plain_text.clone()
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -27,8 +27,9 @@ pub async fn map(
|
|||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||
|
||||
let entries = sitemap::discover(state.fetch(), &req.url).await?;
|
||||
let entries = sitemap::discover(state.fetch(), url.as_str()).await?;
|
||||
|
||||
let body = if req.include_metadata {
|
||||
json!({
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ impl From<ExtractorDispatchError> for ApiError {
|
|||
match e {
|
||||
ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
|
||||
ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),
|
||||
ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()),
|
||||
ExtractorDispatchError::Fetch(f) => ApiError::from(f),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -46,7 +46,8 @@ pub async fn scrape_vertical(
|
|||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
let data = extractors::dispatch_by_name(state.fetch(), &vertical, &req.url).await?;
|
||||
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||
let data = extractors::dispatch_by_name(state.fetch(), &vertical, url.as_str()).await?;
|
||||
Ok(Json(json!({
|
||||
"vertical": vertical,
|
||||
"url": req.url,
|
||||
|
|
|
|||
|
|
@ -22,8 +22,9 @@ pub async fn summarize_route(
|
|||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||
|
||||
let extraction = state.fetch().fetch_and_extract(&req.url).await?;
|
||||
let extraction = state.fetch().fetch_and_extract(url.as_str()).await?;
|
||||
let content = if extraction.content.markdown.trim().is_empty() {
|
||||
extraction.content.plain_text.clone()
|
||||
} else {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue