/// HTTP client with browser TLS fingerprint impersonation. /// Uses wreq (BoringSSL) for browser-grade TLS + HTTP/2 fingerprinting. /// Supports single and batch operations with proxy rotation. /// Automatically detects PDF responses and extracts text via webclaw-pdf. /// /// Two proxy modes: /// - **Static**: single proxy (or none) baked into pre-built clients at construction. /// - **Rotating**: pre-built pool of clients, each with a different proxy + fingerprint. /// Same-host URLs are routed to the same client for HTTP/2 connection reuse. use std::collections::HashMap; use std::hash::{Hash, Hasher}; use std::sync::Arc; use std::time::{Duration, Instant}; use futures_util::StreamExt; use rand::seq::SliceRandom; use tokio::sync::Semaphore; use tracing::{debug, instrument, warn}; use webclaw_pdf::PdfMode; use crate::browser::{self, BrowserProfile, BrowserVariant}; use crate::error::FetchError; /// Configuration for building a [`FetchClient`]. #[derive(Debug, Clone)] pub struct FetchConfig { pub browser: BrowserProfile, /// Single proxy URL. Used when `proxy_pool` is empty. pub proxy: Option, /// Pool of proxy URLs to rotate through. /// When non-empty, each proxy gets a pre-built client with a /// random browser fingerprint. Same-host URLs reuse the same client /// for HTTP/2 connection multiplexing. pub proxy_pool: Vec, pub timeout: Duration, pub follow_redirects: bool, pub max_redirects: u32, pub headers: HashMap, pub pdf_mode: PdfMode, } impl Default for FetchConfig { fn default() -> Self { Self { browser: BrowserProfile::Chrome, proxy: None, proxy_pool: Vec::new(), timeout: Duration::from_secs(12), follow_redirects: true, max_redirects: 10, headers: HashMap::from([("Accept-Language".to_string(), "en-US,en;q=0.9".to_string())]), pdf_mode: PdfMode::default(), } } } /// Result of a successful fetch. #[derive(Debug, Clone)] pub struct FetchResult { pub html: String, pub status: u16, /// Final URL after any redirects. 
pub url: String, pub headers: http::HeaderMap, pub elapsed: Duration, } /// Result for a single URL in a batch fetch operation. #[derive(Debug)] pub struct BatchResult { pub url: String, pub result: Result, } /// Result for a single URL in a batch fetch-and-extract operation. #[derive(Debug)] pub struct BatchExtractResult { pub url: String, pub result: Result, } /// Buffered response that owns its body. Provides the same sync API /// that webclaw-http::Response used to provide. struct Response { status: u16, url: String, headers: http::HeaderMap, body: bytes::Bytes, } /// Maximum fetched body size. A single 50 MB HTML document is already /// several orders of magnitude past any realistic page; larger responses /// are either malicious (log bomb, zip-bomb decompressed) or streaming /// bugs. Caps the blast radius of the HTML → markdown conversion /// downstream (which could otherwise allocate multiple full-size Strings /// per page in collapse_whitespace + strip_markdown). const MAX_BODY_BYTES: u64 = 50 * 1024 * 1024; /// Running decompression-bomb guard: reject as soon as the bytes already /// buffered plus the next decompressed chunk would cross [`MAX_BODY_BYTES`]. /// Saturating arithmetic so a huge chunk length can't wrap the sum. fn check_body_ceiling(buffered: usize, next_chunk: usize) -> Result<(), FetchError> { let total = (buffered as u64).saturating_add(next_chunk as u64); if total > MAX_BODY_BYTES { return Err(FetchError::BodyDecode(format!( "response body exceeds cap {MAX_BODY_BYTES} bytes (decompressed)" ))); } Ok(()) } impl Response { /// Buffer a wreq response into an owned Response. /// /// Rejects bodies that advertise a Content-Length beyond /// [`MAX_BODY_BYTES`] before we pay any allocation, then streams the /// body chunk-by-chunk while enforcing a running ceiling. 
`chunk()` /// yields *post-decompression* bytes (gzip/brotli/zstd/deflate are /// negotiated), so a tiny compressed payload that inflates to /// gigabytes is aborted as soon as the accumulated size crosses the /// cap — it never gets fully buffered in memory. async fn from_wreq(resp: wreq::Response) -> Result { if let Some(len) = resp.content_length() && len > MAX_BODY_BYTES { return Err(FetchError::BodyDecode(format!( "response body {len} bytes exceeds cap {MAX_BODY_BYTES}" ))); } let status = resp.status().as_u16(); let url = resp.uri().to_string(); let headers = resp.headers().clone(); // wreq 6.0.0-rc.29 dropped `Response::chunk()`. Stream post-decompression // bytes via `bytes_stream()` and keep enforcing the running ceiling so a // compression bomb is aborted before it is fully buffered in memory. let mut buf = bytes::BytesMut::new(); let mut stream = resp.bytes_stream(); while let Some(chunk) = stream.next().await { let chunk = chunk.map_err(|e| FetchError::BodyDecode(e.to_string()))?; check_body_ceiling(buf.len(), chunk.len())?; buf.extend_from_slice(&chunk); } Ok(Self { status, url, headers, body: buf.freeze(), }) } fn status(&self) -> u16 { self.status } fn url(&self) -> &str { &self.url } fn headers(&self) -> &http::HeaderMap { &self.headers } fn body(&self) -> &[u8] { &self.body } fn text(&self) -> std::borrow::Cow<'_, str> { String::from_utf8_lossy(&self.body) } fn into_text(self) -> String { String::from_utf8_lossy(&self.body).into_owned() } } /// Internal representation of the client pool strategy. enum ClientPool { /// Pre-built clients with a fixed proxy (or no proxy). /// Fingerprint rotation still works via the pool when `random` is true. Static { clients: Vec, random: bool, }, /// Pre-built pool of clients, each with a different proxy + fingerprint. /// Requests pick a client deterministically by host for HTTP/2 connection reuse. Rotating { clients: Vec }, } /// HTTP client with browser TLS + HTTP/2 fingerprinting via wreq. 
/// /// Operates in two modes: /// - **Static pool**: pre-built clients, optionally with fingerprint rotation. /// Used when no `proxy_pool` is configured. Fast (no per-request construction). /// - **Rotating pool**: pre-built clients, one per proxy in the pool. /// Same-host URLs are routed to the same client for HTTP/2 multiplexing. pub struct FetchClient { pool: ClientPool, pdf_mode: PdfMode, /// Optional cloud-fallback client. Extractors that need to /// escalate past bot protection call `client.cloud()` to get this /// out. Stored as `Arc` so cloning a `FetchClient` (common in /// axum state) doesn't clone the underlying reqwest pool. cloud: Option>, } impl FetchClient { /// Build a new client from config. pub fn new(config: FetchConfig) -> Result { let variants = collect_variants(&config.browser); let pdf_mode = config.pdf_mode.clone(); let pool = if config.proxy_pool.is_empty() { let clients = variants .into_iter() .map(|v| { crate::tls::build_client( v, config.timeout, &config.headers, config.proxy.as_deref(), config.follow_redirects, config.max_redirects, ) }) .collect::, _>>()?; let random = matches!(config.browser, BrowserProfile::Random); debug!( count = clients.len(), random, "fetch client ready (static pool)" ); ClientPool::Static { clients, random } } else { let mut rng = rand::thread_rng(); let clients = config .proxy_pool .iter() .map(|proxy| { let v = *variants.choose(&mut rng).unwrap(); crate::tls::build_client( v, config.timeout, &config.headers, Some(proxy), config.follow_redirects, config.max_redirects, ) }) .collect::, _>>()?; debug!( clients = clients.len(), "fetch client ready (pre-built rotating pool)" ); ClientPool::Rotating { clients } }; Ok(Self { pool, pdf_mode, cloud: None, }) } /// Attach a cloud-fallback client. Returns `self` so it composes in /// a builder-ish way: /// /// ```ignore /// let client = FetchClient::new(config)? 
///     .with_cloud(CloudClient::from_env()?);
    /// ```
    ///
    /// Extractors that can escalate past bot protection will call
    /// `client.cloud()` internally. Sets the field regardless of
    /// whether `cloud` is configured to bypass anything specific —
    /// attachment is cheap (just wraps in `Arc`).
    pub fn with_cloud(mut self, cloud: crate::cloud::CloudClient) -> Self {
        self.cloud = Some(Arc::new(cloud));
        self
    }

    /// Optional cloud-fallback client, if one was attached via
    /// [`Self::with_cloud`]. Extractors that handle antibot sites
    /// pass this into `cloud::smart_fetch_html`.
    pub fn cloud(&self) -> Option<&crate::cloud::CloudClient> {
        self.cloud.as_deref()
    }

    /// Fetch a URL with per-site rescue paths: Reddit URLs are rewritten to
    /// old.reddit.com (server-rendered HTML), and challenge pages (as
    /// detected by `is_challenge_html`) trigger a homepage cookie warmup
    /// and a retry. Returns the same `FetchResult` shape as
    /// [`Self::fetch`] so every caller (CLI, MCP, OSS server, production
    /// server) benefits without shape churn.
    ///
    /// This is the method most callers want. Use plain [`Self::fetch`] only
    /// when you need literal no-rescue behavior (e.g. inside the rescue
    /// logic itself to avoid recursion).
    ///
    /// # Errors
    /// Propagates the error of the initial fetch. Failures of the warmup
    /// or of the post-warmup retry are swallowed and the original
    /// (challenge) response is returned instead.
    pub async fn fetch_smart(&self, url: &str) -> Result<FetchResult, FetchError> {
        // Reddit: fetch old.reddit.com for stable server-rendered HTML.
        // The JSON API is blocked; old.reddit.com works without JS or auth.
        // `owned` is declared up front so the rewritten URL outlives the
        // borrow taken in the `if` branch.
        let owned;
        let url = if crate::reddit::is_reddit_url(url) {
            owned = crate::reddit::to_old_reddit_url(url);
            owned.as_str()
        } else {
            url
        };

        let resp = self.fetch(url).await?;

        // Akamai / bazadebezolkohpepadr challenge: visit the homepage to
        // collect warmup cookies (_abck, bm_sz, etc.), then retry.
        if is_challenge_html(&resp.html)
            && let Some(homepage) = extract_homepage(url)
        {
            debug!("challenge detected, warming cookies via {homepage}");
            // Best effort: a failed warmup must not fail the whole fetch.
            let _ = self.fetch(&homepage).await;
            if let Ok(retry) = self.fetch(url).await {
                return Ok(retry);
            }
        }

        Ok(resp)
    }

    /// Fetch a URL and return the raw HTML + response metadata.
///
    /// Automatically retries once on transient failures: request/body
    /// errors (see `is_retryable_error`) and the status codes accepted by
    /// `is_retryable_status` (429, 502–504, 520–524; plain 500 is *not*
    /// retried). Delays before each attempt: 0s, 1s (2 attempts total).
    /// If the final attempt still yields a retryable status, that response
    /// is returned as `Ok`. No per-site rescue logic; use
    /// [`Self::fetch_smart`] for that.
    ///
    /// # Errors
    /// Returns the error of the failing attempt when it is not retryable
    /// or when it happens on the last attempt.
    #[instrument(skip(self), fields(url = %url))]
    pub async fn fetch(&self, url: &str) -> Result<FetchResult, FetchError> {
        // Delay applied *before* each attempt; the first one runs immediately.
        let delays = [Duration::ZERO, Duration::from_secs(1)];
        let last_attempt = delays.len() - 1;

        for (attempt, delay) in delays.iter().enumerate() {
            if attempt > 0 {
                tokio::time::sleep(*delay).await;
            }

            match self.fetch_once(url).await {
                // Retryable status with attempts left: log and go around again.
                Ok(result) if is_retryable_status(result.status) && attempt < last_attempt => {
                    warn!(
                        url,
                        status = result.status,
                        attempt = attempt + 1,
                        "retryable status, will retry"
                    );
                }
                Ok(result) => {
                    if attempt > 0 {
                        debug!(url, attempt = attempt + 1, "retry succeeded");
                    }
                    return Ok(result);
                }
                // Transient error with attempts left: log and go around again.
                Err(e) if is_retryable_error(&e) && attempt < last_attempt => {
                    warn!(
                        url,
                        error = %e,
                        attempt = attempt + 1,
                        "transient error, will retry"
                    );
                }
                Err(e) => return Err(e),
            }
        }

        // Not reachable in practice: the last attempt always returns above.
        Err(FetchError::Build("all retries exhausted".into()))
    }

    /// Single fetch attempt.
    async fn fetch_once(&self, url: &str) -> Result<FetchResult, FetchError> {
        self.fetch_once_with_headers(url, &[]).await
    }

    /// Single fetch attempt with optional per-request headers appended
    /// after the profile defaults. Used by extractors that need to
    /// satisfy site-specific headers (e.g. `x-ig-app-id` for Instagram's
    /// internal API).
async fn fetch_once_with_headers(
        &self,
        url: &str,
        extra: &[(&str, &str)],
    ) -> Result<FetchResult, FetchError> {
        // Validate first; the request is sent to the validated URL's
        // serialization rather than the caller's raw string.
        let parsed_url = crate::url_security::validate_public_http_url(url).await?;
        let url = parsed_url.as_str();
        let start = Instant::now();
        let client = self.pick_client(url);

        let mut req = client.get(url);
        for (k, v) in extra {
            req = req.header(*k, *v);
        }

        let resp = req.send().await?;
        let response = Response::from_wreq(resp).await?;
        response_to_result(response, start)
    }

    /// Fetch a URL with extra per-request headers appended after the
    /// browser-profile defaults. Same retry semantics as `fetch`: one
    /// retry after 1s on a retryable error or status, and a retryable
    /// status on the final attempt is returned as `Ok`.
    ///
    /// Use this when an upstream API requires a header the global
    /// `FetchConfig.headers` shouldn't carry to other hosts (Instagram's
    /// `x-ig-app-id`, GitHub's `Authorization` once we wire `GITHUB_TOKEN`,
    /// Reddit's compliant UA when we add OAuth, etc.).
    ///
    /// # Errors
    /// Returns the error of the failing attempt when it is not retryable
    /// or when it happens on the last attempt.
    #[instrument(skip(self, extra), fields(url = %url, extra_count = extra.len()))]
    pub async fn fetch_with_headers(
        &self,
        url: &str,
        extra: &[(&str, &str)],
    ) -> Result<FetchResult, FetchError> {
        // Delay applied *before* each attempt; the first one runs immediately.
        let delays = [Duration::ZERO, Duration::from_secs(1)];
        let last_attempt = delays.len() - 1;

        for (attempt, delay) in delays.iter().enumerate() {
            if attempt > 0 {
                tokio::time::sleep(*delay).await;
            }

            match self.fetch_once_with_headers(url, extra).await {
                // Retryable status with attempts left: log and go around again.
                Ok(result) if is_retryable_status(result.status) && attempt < last_attempt => {
                    warn!(
                        url,
                        status = result.status,
                        attempt = attempt + 1,
                        "retryable status, will retry"
                    );
                }
                Ok(result) => {
                    if attempt > 0 {
                        debug!(url, attempt = attempt + 1, "retry succeeded");
                    }
                    return Ok(result);
                }
                // Transient error with attempts left: log and go around again.
                Err(e) if is_retryable_error(&e) && attempt < last_attempt => {
                    warn!(
                        url,
                        error = %e,
                        attempt = attempt + 1,
                        "transient error, will retry"
                    );
                }
                Err(e) => return Err(e),
            }
        }

        // Not reachable in practice: the last attempt always returns above.
        Err(FetchError::Build("all retries exhausted".into()))
    }

    /// Fetch a URL then extract structured content.
#[instrument(skip(self), fields(url = %url))] pub async fn fetch_and_extract( &self, url: &str, ) -> Result { self.fetch_and_extract_with_options(url, &webclaw_core::ExtractionOptions::default()) .await } /// Fetch a URL then extract structured content with custom extraction options. #[instrument(skip(self, options), fields(url = %url))] pub async fn fetch_and_extract_with_options( &self, url: &str, options: &webclaw_core::ExtractionOptions, ) -> Result { let parsed_url = crate::url_security::validate_public_http_url(url).await?; let url = parsed_url.as_str(); // Reddit: rewrite to old.reddit.com for stable server-rendered HTML. // webclaw-core's Reddit fast path then parses the thread structure. let reddit_owned; let url = if crate::reddit::is_reddit_url(url) { reddit_owned = crate::reddit::to_old_reddit_url(url); debug!("reddit: rewriting to {reddit_owned}"); reddit_owned.as_str() } else { url }; let start = Instant::now(); let client = self.pick_client(url); let resp = client.get(url).send().await?; let mut response = Response::from_wreq(resp).await?; // Cookie warmup: if we get a challenge page, visit the homepage first // to collect Akamai cookies (_abck, bm_sz, etc.), then retry. 
if is_challenge_response(&response) && let Some(homepage) = extract_homepage(url) { debug!("challenge detected, warming cookies via {homepage}"); let _ = self.fetch(&homepage).await; let resp = client.get(url).send().await?; response = Response::from_wreq(resp).await?; debug!("retried after cookie warmup: status={}", response.status()); } let status = response.status(); let final_url = response.url().to_string(); let headers = response.headers().clone(); let is_pdf = is_pdf_content_type(&headers); if is_pdf { debug!(status, "detected PDF response, using pdf extraction"); let bytes = response.body(); let elapsed = start.elapsed(); debug!( status, bytes = bytes.len(), elapsed_ms = %elapsed.as_millis(), "PDF fetch complete" ); let pdf_result = webclaw_pdf::extract_pdf(bytes, self.pdf_mode.clone())?; Ok(pdf_to_extraction_result(&pdf_result, &final_url)) } else if let Some(doc_type) = crate::document::is_document_content_type(&headers, &final_url) { debug!(status, doc_type = ?doc_type, "detected document response, extracting"); let bytes = response.body(); let elapsed = start.elapsed(); debug!( status, bytes = bytes.len(), elapsed_ms = %elapsed.as_millis(), "document fetch complete" ); let mut result = crate::document::extract_document(bytes, doc_type)?; result.metadata.url = Some(final_url); Ok(result) } else { let html = response.into_text(); let elapsed = start.elapsed(); debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete"); // LinkedIn: extract from embedded JSON blobs if crate::linkedin::is_linkedin_post(&final_url) { if let Some(result) = crate::linkedin::extract_linkedin_post(&html, &final_url) { debug!("linkedin extraction succeeded"); return Ok(result); } debug!("linkedin extraction failed, falling back to standard"); } let extraction = webclaw_core::extract_with_options(&html, Some(&final_url), options)?; Ok(extraction) } } /// Fetch multiple URLs concurrently with bounded parallelism. 
pub async fn fetch_batch( self: &Arc, urls: &[&str], concurrency: usize, ) -> Vec { let semaphore = Arc::new(Semaphore::new(concurrency)); let mut handles = Vec::with_capacity(urls.len()); for (idx, url) in urls.iter().enumerate() { let permit = Arc::clone(&semaphore); let client = Arc::clone(self); let url = url.to_string(); handles.push(tokio::spawn(async move { // Don't panic if the semaphore has been closed under us // (adversarial runtime state or shutdown race). Surface a // typed error instead so the caller sees one failed URL in // the batch instead of a silently-dropped task. let result = match permit.acquire().await { Ok(_permit) => client.fetch(&url).await, Err(_) => Err(FetchError::Build("semaphore closed before acquire".into())), }; (idx, BatchResult { url, result }) })); } collect_ordered(handles, urls.len()).await } /// Fetch and extract multiple URLs concurrently with bounded parallelism. pub async fn fetch_and_extract_batch( self: &Arc, urls: &[&str], concurrency: usize, ) -> Vec { self.fetch_and_extract_batch_with_options( urls, concurrency, &webclaw_core::ExtractionOptions::default(), ) .await } /// Fetch and extract multiple URLs concurrently with custom extraction options. 
pub async fn fetch_and_extract_batch_with_options( self: &Arc, urls: &[&str], concurrency: usize, options: &webclaw_core::ExtractionOptions, ) -> Vec { let semaphore = Arc::new(Semaphore::new(concurrency)); let mut handles = Vec::with_capacity(urls.len()); for (idx, url) in urls.iter().enumerate() { let permit = Arc::clone(&semaphore); let client = Arc::clone(self); let url = url.to_string(); let opts = options.clone(); handles.push(tokio::spawn(async move { let result = match permit.acquire().await { Ok(_permit) => client.fetch_and_extract_with_options(&url, &opts).await, Err(_) => Err(FetchError::Build("semaphore closed before acquire".into())), }; (idx, BatchExtractResult { url, result }) })); } collect_ordered(handles, urls.len()).await } /// Returns the number of proxies in the rotation pool, or 0 if static mode. pub fn proxy_pool_size(&self) -> usize { match &self.pool { ClientPool::Static { .. } => 0, ClientPool::Rotating { clients } => clients.len(), } } /// Pick a client from the pool for a given URL. fn pick_client(&self, url: &str) -> &wreq::Client { match &self.pool { ClientPool::Static { clients, random } => { if *random { let host = extract_host(url); pick_for_host(clients, &host) } else { &clients[0] } } ClientPool::Rotating { clients } => pick_random(clients), } } } // --------------------------------------------------------------------------- // Fetcher trait implementation // // Vertical extractors consume the [`crate::fetcher::Fetcher`] trait // rather than `FetchClient` directly, which is what lets the production // API server swap in a tls-sidecar-backed implementation without // pulling wreq into its dependency graph. For everyone else (CLI, MCP, // self-hosted OSS server) this impl means "pass the FetchClient you // already have; nothing changes". 
// --------------------------------------------------------------------------- #[async_trait::async_trait] impl crate::fetcher::Fetcher for FetchClient { async fn fetch(&self, url: &str) -> Result { FetchClient::fetch(self, url).await } async fn fetch_with_headers( &self, url: &str, headers: &[(&str, &str)], ) -> Result { FetchClient::fetch_with_headers(self, url, headers).await } fn cloud(&self) -> Option<&crate::cloud::CloudClient> { FetchClient::cloud(self) } } /// Collect the browser variants to use based on the browser profile. fn collect_variants(profile: &BrowserProfile) -> Vec { match profile { BrowserProfile::Random => browser::all_variants(), BrowserProfile::Chrome => vec![browser::latest_chrome()], BrowserProfile::Firefox => vec![browser::latest_firefox()], BrowserProfile::SafariIos => vec![BrowserVariant::SafariIos26], } } /// Convert a buffered Response into a FetchResult. fn response_to_result(response: Response, start: Instant) -> Result { let status = response.status(); let final_url = response.url().to_string(); let headers = response.headers().clone(); let html = response.into_text(); let elapsed = start.elapsed(); debug!(status, elapsed_ms = %elapsed.as_millis(), "fetch complete"); Ok(FetchResult { html, status, url: final_url, headers, elapsed, }) } /// Extract the host from a URL, returning empty string on parse failure. fn extract_host(url: &str) -> String { url::Url::parse(url) .ok() .and_then(|u| u.host_str().map(String::from)) .unwrap_or_default() } /// Pick a client deterministically based on a host string. /// Same host always gets the same client, enabling HTTP/2 connection reuse. fn pick_for_host<'a>(clients: &'a [wreq::Client], host: &str) -> &'a wreq::Client { let mut hasher = std::collections::hash_map::DefaultHasher::new(); host.hash(&mut hasher); let idx = (hasher.finish() as usize) % clients.len(); &clients[idx] } /// Pick a random client from the pool for per-request rotation. 
fn pick_random(clients: &[wreq::Client]) -> &wreq::Client { use rand::Rng; let idx = rand::thread_rng().gen_range(0..clients.len()); &clients[idx] } /// Status codes worth retrying: server errors + rate limiting. fn is_retryable_status(status: u16) -> bool { status == 429 || status == 502 || status == 503 || status == 504 || status == 520 || status == 521 || status == 522 || status == 523 || status == 524 } /// Errors worth retrying: network/connection failures (not client errors). fn is_retryable_error(err: &FetchError) -> bool { matches!(err, FetchError::Request(_) | FetchError::BodyDecode(_)) } fn is_pdf_content_type(headers: &http::HeaderMap) -> bool { headers .get("content-type") .and_then(|ct| ct.to_str().ok()) .map(|ct| { let mime = ct.split(';').next().unwrap_or("").trim(); mime.eq_ignore_ascii_case("application/pdf") }) .unwrap_or(false) } /// Detect if a response looks like a bot protection challenge page. fn is_challenge_response(response: &Response) -> bool { let body_len = response.body().len(); if body_len > 15_000 || body_len == 0 { return false; } is_challenge_html(response.text().as_ref()) } /// Same as `is_challenge_response`, operating on a body string directly /// so callers holding a `FetchResult` can reuse the heuristic. fn is_challenge_html(html: &str) -> bool { let len = html.len(); if len > 15_000 || len == 0 { return false; } let lower = html.to_lowercase(); if lower.contains("challenge page") { return true; } if lower.contains("bazadebezolkohpepadr") && len < 5_000 { return true; } false } /// Extract the homepage URL (scheme + host[:port]) from a full URL. fn extract_homepage(url: &str) -> Option { url::Url::parse(url).ok().map(|u| { let host = u.host_str().unwrap_or(""); // `port()` is `Some` only for a non-default port; include it so a // host like example.com:8443 is warmed on the right port. 
match u.port() { Some(port) => format!("{}://{}:{}/", u.scheme(), host, port), None => format!("{}://{}/", u.scheme(), host), } }) } /// Convert a webclaw-pdf PdfResult into a webclaw-core ExtractionResult. fn pdf_to_extraction_result( pdf: &webclaw_pdf::PdfResult, url: &str, ) -> webclaw_core::ExtractionResult { let markdown = webclaw_pdf::to_markdown(pdf); let word_count = markdown.split_whitespace().count(); webclaw_core::ExtractionResult { metadata: webclaw_core::Metadata { title: pdf.metadata.title.clone(), description: pdf.metadata.subject.clone(), author: pdf.metadata.author.clone(), published_date: None, language: None, url: Some(url.to_string()), site_name: None, image: None, favicon: None, word_count, }, content: webclaw_core::Content { markdown, plain_text: pdf.text.clone(), links: Vec::new(), images: Vec::new(), code_blocks: Vec::new(), raw_html: None, }, domain_data: None, structured_data: vec![], } } /// Collect spawned tasks and reorder results to match input order. async fn collect_ordered( handles: Vec>, len: usize, ) -> Vec { let mut slots: Vec> = (0..len).map(|_| None).collect(); for handle in handles { match handle.await { Ok((idx, result)) => { slots[idx] = Some(result); } Err(e) => { warn!(error = %e, "batch task panicked"); } } } slots.into_iter().flatten().collect() } #[cfg(test)] mod tests { use super::*; #[test] fn test_batch_result_struct() { let ok = BatchResult { url: "https://example.com".to_string(), result: Ok(FetchResult { html: "".to_string(), status: 200, url: "https://example.com".to_string(), headers: http::HeaderMap::new(), elapsed: Duration::from_millis(42), }), }; assert_eq!(ok.url, "https://example.com"); assert!(ok.result.is_ok()); assert_eq!(ok.result.unwrap().status, 200); let err = BatchResult { url: "https://bad.example".to_string(), result: Err(FetchError::InvalidUrl("bad url".into())), }; assert!(err.result.is_err()); } #[test] fn body_ceiling_allows_under_cap() { assert!(check_body_ceiling(0, 1024).is_ok()); 
assert!(check_body_ceiling(MAX_BODY_BYTES as usize - 1, 1).is_ok()); } #[test] fn body_ceiling_rejects_at_and_over_cap() { // Exactly at the cap is allowed; one byte over is rejected. assert!(check_body_ceiling(MAX_BODY_BYTES as usize, 1).is_err()); // A small buffer plus a huge inflated chunk (decompression bomb) // is caught on the very first oversized chunk. let err = check_body_ceiling(16, 64 * 1024 * 1024).unwrap_err(); assert!(matches!(err, FetchError::BodyDecode(_))); } #[test] fn body_ceiling_saturates_on_overflow() { // usize::MAX chunk must not wrap the running sum to a small value. assert!(check_body_ceiling(usize::MAX, usize::MAX).is_err()); } #[test] fn test_batch_extract_result_struct() { let err = BatchExtractResult { url: "https://example.com".to_string(), result: Err(FetchError::BodyDecode("timeout".into())), }; assert_eq!(err.url, "https://example.com"); assert!(err.result.is_err()); } #[tokio::test] async fn test_batch_preserves_order() { let handles: Vec> = vec![ tokio::spawn(async { (2, "c".to_string()) }), tokio::spawn(async { (0, "a".to_string()) }), tokio::spawn(async { (1, "b".to_string()) }), ]; let results = collect_ordered(handles, 3).await; assert_eq!(results, vec!["a", "b", "c"]); } #[tokio::test] async fn test_collect_ordered_handles_gaps() { let handles: Vec> = vec![ tokio::spawn(async { (0, "first".to_string()) }), tokio::spawn(async { (2, "third".to_string()) }), ]; let results = collect_ordered(handles, 3).await; assert_eq!(results.len(), 2); assert_eq!(results[0], "first"); assert_eq!(results[1], "third"); } #[test] fn test_is_pdf_content_type() { let mut headers = http::HeaderMap::new(); headers.insert("content-type", "application/pdf".parse().unwrap()); assert!(is_pdf_content_type(&headers)); headers.insert( "content-type", "application/pdf; charset=utf-8".parse().unwrap(), ); assert!(is_pdf_content_type(&headers)); headers.insert("content-type", "Application/PDF".parse().unwrap()); assert!(is_pdf_content_type(&headers)); 
headers.insert("content-type", "text/html".parse().unwrap()); assert!(!is_pdf_content_type(&headers)); let empty = http::HeaderMap::new(); assert!(!is_pdf_content_type(&empty)); } #[test] fn test_pdf_to_extraction_result() { let pdf = webclaw_pdf::PdfResult { text: "Hello from PDF.".into(), page_count: 2, metadata: webclaw_pdf::PdfMetadata { title: Some("My Doc".into()), author: Some("Author".into()), subject: Some("Testing".into()), creator: None, }, }; let result = pdf_to_extraction_result(&pdf, "https://example.com/doc.pdf"); assert_eq!(result.metadata.title.as_deref(), Some("My Doc")); assert_eq!(result.metadata.author.as_deref(), Some("Author")); assert_eq!(result.metadata.description.as_deref(), Some("Testing")); assert_eq!( result.metadata.url.as_deref(), Some("https://example.com/doc.pdf") ); assert!(result.content.markdown.contains("# My Doc")); assert!(result.content.markdown.contains("Hello from PDF.")); assert_eq!(result.content.plain_text, "Hello from PDF."); assert!(result.content.links.is_empty()); assert!(result.domain_data.is_none()); assert!(result.metadata.word_count > 0); } #[test] fn test_static_pool_no_proxy() { let config = FetchConfig::default(); let client = FetchClient::new(config).unwrap(); assert_eq!(client.proxy_pool_size(), 0); } #[test] fn test_rotating_pool_prebuilds_clients() { let config = FetchConfig { proxy_pool: vec![ "http://proxy1:8080".into(), "http://proxy2:8080".into(), "http://proxy3:8080".into(), ], ..Default::default() }; let client = FetchClient::new(config).unwrap(); assert_eq!(client.proxy_pool_size(), 3); } #[test] fn test_pick_for_host_deterministic() { let config = FetchConfig { browser: BrowserProfile::Random, ..Default::default() }; let client = FetchClient::new(config).unwrap(); let clients = match &client.pool { ClientPool::Static { clients, .. 
} => clients, ClientPool::Rotating { clients } => clients, }; let a1 = pick_for_host(clients, "example.com") as *const _; let a2 = pick_for_host(clients, "example.com") as *const _; let a3 = pick_for_host(clients, "example.com") as *const _; assert_eq!(a1, a2); assert_eq!(a2, a3); } #[test] fn test_pick_for_host_distributes() { let config = FetchConfig { proxy_pool: (0..10).map(|i| format!("http://proxy{i}:8080")).collect(), ..Default::default() }; let client = FetchClient::new(config).unwrap(); let clients = match &client.pool { ClientPool::Static { clients, .. } | ClientPool::Rotating { clients } => clients, }; let hosts = [ "example.com", "google.com", "github.com", "rust-lang.org", "crates.io", ]; let indices: Vec = hosts .iter() .map(|h| { let ptr = pick_for_host(clients, h) as *const _; clients.iter().position(|c| std::ptr::eq(c, ptr)).unwrap() }) .collect(); let unique: std::collections::HashSet<_> = indices.iter().collect(); assert!( unique.len() >= 2, "expected host distribution across clients, got indices: {indices:?}" ); } #[test] fn test_extract_host() { assert_eq!(extract_host("https://example.com/path"), "example.com"); assert_eq!( extract_host("https://sub.example.com:8080/foo"), "sub.example.com" ); assert_eq!(extract_host("not-a-url"), ""); } #[test] fn test_default_config_has_empty_proxy_pool() { let config = FetchConfig::default(); assert!(config.proxy_pool.is_empty()); assert!(config.proxy.is_none()); } }