mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-06 22:05:13 +02:00
fix: harden fetch URL validation
This commit is contained in:
parent
23544f8fac
commit
bdf81fe6bf
10 changed files with 284 additions and 27 deletions
|
|
@ -70,7 +70,12 @@ impl IntoResponse for ApiError {
|
|||
|
||||
/// Maps fetch-layer failures onto API errors.
///
/// A `FetchError::InvalidUrl` is reported as `ApiError::BadRequest` with the
/// message prefixed by `invalid url: `; every other fetch error is stringified
/// into `ApiError::Fetch`.
impl From<webclaw_fetch::FetchError> for ApiError {
    fn from(e: webclaw_fetch::FetchError) -> Self {
        match e {
            // URL validation failures go to `BadRequest` rather than the
            // generic `Fetch` variant used for everything else.
            webclaw_fetch::FetchError::InvalidUrl(msg) => {
                Self::BadRequest(format!("invalid url: {msg}"))
            }
            other => Self::Fetch(other.to_string()),
        }
    }
}
|
||||
|
||||
|
|
|
|||
|
|
@ -75,6 +75,15 @@ async fn main() -> anyhow::Result<()> {
|
|||
.compact()
|
||||
.init();
|
||||
|
||||
if is_unspecified_addr(args.host)
|
||||
&& args.api_key.is_none()
|
||||
&& std::env::var_os("WEBCLAW_ALLOW_OPEN_PUBLIC").is_none()
|
||||
{
|
||||
anyhow::bail!(
|
||||
"refusing to bind 0.0.0.0/[::] without WEBCLAW_API_KEY; set WEBCLAW_API_KEY or WEBCLAW_ALLOW_OPEN_PUBLIC=1 to override"
|
||||
);
|
||||
}
|
||||
|
||||
let state = AppState::new(args.api_key.clone())?;
|
||||
|
||||
let v1 = Router::new()
|
||||
|
|
@ -121,3 +130,10 @@ async fn main() -> anyhow::Result<()> {
|
|||
axum::serve(listener, app).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reports whether `addr` is a wildcard address.
///
/// Returns `true` for `0.0.0.0`, for `::`, and for the IPv4-mapped form
/// `::ffff:0.0.0.0`. The mapped form is included because
/// `Ipv6Addr::is_unspecified` only recognises `::`.
// NOTE(review): assumes a dual-stack socket treats `::ffff:0.0.0.0` as the
// IPv4 wildcard — confirm against the target platforms.
fn is_unspecified_addr(addr: IpAddr) -> bool {
    match addr {
        IpAddr::V4(ip) => ip.is_unspecified(),
        IpAddr::V6(ip) => {
            ip.is_unspecified()
                || ip
                    .to_ipv4_mapped()
                    .map_or(false, |mapped| mapped.is_unspecified())
        }
    }
}
|
||||
|
|
|
|||
|
|
@ -37,6 +37,14 @@ pub async fn batch(
|
|||
req.urls.len()
|
||||
)));
|
||||
}
|
||||
let mut safe_urls = Vec::with_capacity(req.urls.len());
|
||||
for url in &req.urls {
|
||||
safe_urls.push(
|
||||
webclaw_fetch::url_security::validate_public_http_url(url)
|
||||
.await?
|
||||
.to_string(),
|
||||
);
|
||||
}
|
||||
|
||||
let concurrency = req.concurrency.unwrap_or(5).clamp(1, HARD_MAX_CONCURRENCY);
|
||||
|
||||
|
|
@ -47,7 +55,7 @@ pub async fn batch(
|
|||
include_raw_html: false,
|
||||
};
|
||||
|
||||
let url_refs: Vec<&str> = req.urls.iter().map(|s| s.as_str()).collect();
|
||||
let url_refs: Vec<&str> = safe_urls.iter().map(|s| s.as_str()).collect();
|
||||
let results = state
|
||||
.fetch()
|
||||
.fetch_and_extract_batch_with_options(&url_refs, concurrency, &options)
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ pub async fn scrape(
|
|||
if req.url.trim().is_empty() {
|
||||
return Err(ApiError::bad_request("`url` is required"));
|
||||
}
|
||||
let url = webclaw_fetch::url_security::validate_public_http_url(&req.url).await?;
|
||||
let formats = req.formats.as_vec();
|
||||
|
||||
let options = ExtractionOptions {
|
||||
|
|
@ -63,11 +64,11 @@ pub async fn scrape(
|
|||
|
||||
let extraction = state
|
||||
.fetch()
|
||||
.fetch_and_extract_with_options(&req.url, &options)
|
||||
.fetch_and_extract_with_options(url.as_str(), &options)
|
||||
.await?;
|
||||
|
||||
let mut body = json!({
|
||||
"url": extraction.metadata.url.clone().unwrap_or_else(|| req.url.clone()),
|
||||
"url": extraction.metadata.url.clone().unwrap_or_else(|| url.to_string()),
|
||||
"metadata": extraction.metadata,
|
||||
});
|
||||
let obj = body.as_object_mut().expect("json::object");
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue