mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-23 02:48:06 +02:00
fix: harden fetch URL validation
This commit is contained in:
parent
23544f8fac
commit
bdf81fe6bf
10 changed files with 284 additions and 27 deletions
|
|
@ -199,6 +199,8 @@ impl FetchClient {
|
|||
config.timeout,
|
||||
&config.headers,
|
||||
config.proxy.as_deref(),
|
||||
config.follow_redirects,
|
||||
config.max_redirects,
|
||||
)
|
||||
})
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
|
@ -218,7 +220,14 @@ impl FetchClient {
|
|||
.iter()
|
||||
.map(|proxy| {
|
||||
let v = *variants.choose(&mut rng).unwrap();
|
||||
crate::tls::build_client(v, config.timeout, &config.headers, Some(proxy))
|
||||
crate::tls::build_client(
|
||||
v,
|
||||
config.timeout,
|
||||
&config.headers,
|
||||
Some(proxy),
|
||||
config.follow_redirects,
|
||||
config.max_redirects,
|
||||
)
|
||||
})
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
|
|
@ -379,6 +388,8 @@ impl FetchClient {
|
|||
url: &str,
|
||||
extra: &[(&str, &str)],
|
||||
) -> Result<FetchResult, FetchError> {
|
||||
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
|
||||
let url = parsed_url.as_str();
|
||||
let start = Instant::now();
|
||||
let client = self.pick_client(url);
|
||||
|
||||
|
|
@ -463,13 +474,17 @@ impl FetchClient {
|
|||
url: &str,
|
||||
options: &webclaw_core::ExtractionOptions,
|
||||
) -> Result<webclaw_core::ExtractionResult, FetchError> {
|
||||
let parsed_url = crate::url_security::validate_public_http_url(url).await?;
|
||||
let url = parsed_url.as_str();
|
||||
|
||||
// Reddit fallback: use their JSON API to get post + full comment tree.
|
||||
if crate::reddit::is_reddit_url(url) {
|
||||
let json_url = crate::reddit::json_url(url);
|
||||
let json_url = crate::url_security::validate_public_http_url(&json_url).await?;
|
||||
debug!("reddit detected, fetching {json_url}");
|
||||
|
||||
let client = self.pick_client(url);
|
||||
let resp = client.get(&json_url).send().await?;
|
||||
let resp = client.get(json_url.as_str()).send().await?;
|
||||
let response = Response::from_wreq(resp).await?;
|
||||
if response.is_success() {
|
||||
let bytes = response.body();
|
||||
|
|
@ -491,7 +506,7 @@ impl FetchClient {
|
|||
&& let Some(homepage) = extract_homepage(url)
|
||||
{
|
||||
debug!("challenge detected, warming cookies via {homepage}");
|
||||
let _ = client.get(&homepage).send().await;
|
||||
let _ = self.fetch(&homepage).await;
|
||||
let resp = client.get(url).send().await?;
|
||||
response = Response::from_wreq(resp).await?;
|
||||
debug!("retried after cookie warmup: status={}", response.status());
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue