mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
fix: validate self-host route URLs consistently
This commit is contained in:
parent
eede2f6953
commit
1c9def2fde
8 changed files with 26 additions and 10 deletions
|
|
@ -74,7 +74,16 @@ impl From<webclaw_fetch::FetchError> for ApiError {
|
||||||
webclaw_fetch::FetchError::InvalidUrl(msg) => {
|
webclaw_fetch::FetchError::InvalidUrl(msg) => {
|
||||||
Self::BadRequest(format!("invalid url: {msg}"))
|
Self::BadRequest(format!("invalid url: {msg}"))
|
||||||
}
|
}
|
||||||
other => Self::Fetch(other.to_string()),
|
other => {
|
||||||
|
let msg = other.to_string();
|
||||||
|
if msg.contains("invalid url:")
|
||||||
|
|| msg.contains("blocked private or internal address")
|
||||||
|
{
|
||||||
|
Self::BadRequest(msg)
|
||||||
|
} else {
|
||||||
|
Self::Fetch(msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -21,8 +21,9 @@ pub async fn brand(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
|
|
||||||
let fetched = state.fetch().fetch(&req.url).await?;
|
let fetched = state.fetch().fetch(url.as_str()).await?;
|
||||||
let brand = extract_brand(&fetched.html, Some(&fetched.url));
|
let brand = extract_brand(&fetched.html, Some(&fetched.url));
|
||||||
|
|
||||||
Ok(Json(json!({
|
Ok(Json(json!({
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,7 @@ pub async fn crawl(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
let max_pages = req.max_pages.unwrap_or(50).min(HARD_MAX_PAGES);
|
let max_pages = req.max_pages.unwrap_or(50).min(HARD_MAX_PAGES);
|
||||||
let max_depth = req.max_depth.unwrap_or(3);
|
let max_depth = req.max_depth.unwrap_or(3);
|
||||||
let concurrency = req.concurrency.unwrap_or(5).min(20);
|
let concurrency = req.concurrency.unwrap_or(5).min(20);
|
||||||
|
|
@ -56,8 +57,8 @@ pub async fn crawl(
|
||||||
cancel_flag: None,
|
cancel_flag: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let crawler = Crawler::new(&req.url, config).map_err(ApiError::from)?;
|
let crawler = Crawler::new(url.as_str(), config).map_err(ApiError::from)?;
|
||||||
let result = crawler.crawl(&req.url, None).await;
|
let result = crawler.crawl(url.as_str(), None).await;
|
||||||
|
|
||||||
let pages: Vec<Value> = result
|
let pages: Vec<Value> = result
|
||||||
.pages
|
.pages
|
||||||
|
|
|
||||||
|
|
@ -75,8 +75,9 @@ pub async fn diff_route(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
|
|
||||||
let current = state.fetch().fetch_and_extract(&req.url).await?;
|
let current = state.fetch().fetch_and_extract(url.as_str()).await?;
|
||||||
let previous = req.previous.into_extraction();
|
let previous = req.previous.into_extraction();
|
||||||
let result = diff(&previous, &current);
|
let result = diff(&previous, &current);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -43,10 +43,11 @@ pub async fn extract(
|
||||||
"either `schema` or `prompt` is required",
|
"either `schema` or `prompt` is required",
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
|
|
||||||
// Fetch + extract first so we feed the LLM clean markdown instead of
|
// Fetch + extract first so we feed the LLM clean markdown instead of
|
||||||
// raw HTML. Cheaper tokens, better signal.
|
// raw HTML. Cheaper tokens, better signal.
|
||||||
let extraction = state.fetch().fetch_and_extract(&req.url).await?;
|
let extraction = state.fetch().fetch_and_extract(url.as_str()).await?;
|
||||||
let content = if extraction.content.markdown.trim().is_empty() {
|
let content = if extraction.content.markdown.trim().is_empty() {
|
||||||
extraction.content.plain_text.clone()
|
extraction.content.plain_text.clone()
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
|
|
@ -27,8 +27,9 @@ pub async fn map(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
|
|
||||||
let entries = sitemap::discover(state.fetch(), &req.url).await?;
|
let entries = sitemap::discover(state.fetch(), url.as_str()).await?;
|
||||||
|
|
||||||
let body = if req.include_metadata {
|
let body = if req.include_metadata {
|
||||||
json!({
|
json!({
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ impl From<ExtractorDispatchError> for ApiError {
|
||||||
match e {
|
match e {
|
||||||
ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
|
ExtractorDispatchError::UnknownVertical(_) => ApiError::NotFound,
|
||||||
ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),
|
ExtractorDispatchError::UrlMismatch { .. } => ApiError::bad_request(e.to_string()),
|
||||||
ExtractorDispatchError::Fetch(f) => ApiError::Fetch(f.to_string()),
|
ExtractorDispatchError::Fetch(f) => ApiError::from(f),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -46,7 +46,8 @@ pub async fn scrape_vertical(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
let data = extractors::dispatch_by_name(state.fetch(), &vertical, &req.url).await?;
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
|
let data = extractors::dispatch_by_name(state.fetch(), &vertical, url.as_str()).await?;
|
||||||
Ok(Json(json!({
|
Ok(Json(json!({
|
||||||
"vertical": vertical,
|
"vertical": vertical,
|
||||||
"url": req.url,
|
"url": req.url,
|
||||||
|
|
|
||||||
|
|
@ -22,8 +22,9 @@ pub async fn summarize_route(
|
||||||
if req.url.trim().is_empty() {
|
if req.url.trim().is_empty() {
|
||||||
return Err(ApiError::bad_request("`url` is required"));
|
return Err(ApiError::bad_request("`url` is required"));
|
||||||
}
|
}
|
||||||
|
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||||
|
|
||||||
let extraction = state.fetch().fetch_and_extract(&req.url).await?;
|
let extraction = state.fetch().fetch_and_extract(url.as_str()).await?;
|
||||||
let content = if extraction.content.markdown.trim().is_empty() {
|
let content = if extraction.content.markdown.trim().is_empty() {
|
||||||
extraction.content.plain_text.clone()
|
extraction.content.plain_text.clone()
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue