diff --git a/.github/banner.png b/.github/banner.png index 07a6673..968277f 100644 Binary files a/.github/banner.png and b/.github/banner.png differ diff --git a/CHANGELOG.md b/CHANGELOG.md index 856cc11..97aac8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,21 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.6.7] — 2026-06-09 + +### Changed +- Updated the HTTP/TLS engine (wreq 6.0.0-rc.29, wreq-util 3.0.0-rc.12). This pulls in upstream robustness fixes: no more panic on responses with non-UTF8 header values, a fix for short reads when decoding large compressed bodies, and the TCP nodelay setting is restored. Browser TLS fingerprints are unchanged. + +--- + +## [0.6.6] — 2026-06-09 + +### Added +- Slow fetches now print a progress line to stderr every 10 seconds (`# webclaw: still fetching (Ns)`) so a long request no longer looks like the CLI hung. Fast fetches stay silent and stdout is untouched. +- New `--url-encoded` flag plus a warning when a URL looks like the shell split it on `&` or `?`. The warning suggests quoting the URL; pass `--url-encoded` to silence it when the URL is intentional. 
+ +--- + ## [0.6.5] — 2026-06-04 ### Changed diff --git a/Cargo.lock b/Cargo.lock index 78e7e77..3196dd9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -28,18 +28,6 @@ dependencies = [ "cpufeatures", ] -[[package]] -name = "ahash" -version = "0.8.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" -dependencies = [ - "cfg-if", - "once_cell", - "version_check", - "zerocopy", -] - [[package]] name = "aho-corasick" version = "1.1.4" @@ -64,6 +52,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -272,9 +266,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.11.0" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" [[package]] name = "block-buffer" @@ -285,31 +279,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "boring-sys2" -version = "5.0.0-alpha.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "455d79965f5155dcc88a7abce112c3590883889131b799beda10bf9a813ed669" -dependencies = [ - "bindgen", - "cmake", - "fs_extra", - "fslock", -] - -[[package]] -name = "boring2" -version = "5.0.0-alpha.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "183ccc3854411c035410dcdbffafca62084f3a6c33f013c77e83c025d2a08a28" -dependencies = [ - "bitflags", - "boring-sys2", - "foreign-types", - "libc", - "openssl-macros", -] - [[package]] name = "brotli" version = "8.0.2" @@ -331,6 +300,31 @@ dependencies = [ "alloc-stdlib", ] +[[package]] +name = "btls" 
+version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c5e60b8c8d282c86360cab651ded04ab0335a7b5390c8d34145cbeab8cacf5f" +dependencies = [ + "bitflags", + "btls-sys", + "foreign-types", + "libc", + "openssl-macros", +] + +[[package]] +name = "btls-sys" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b1b8638a2e1c38a5ae4efa90ae57e643baec35a30d03fc5b399b893adc4954b" +dependencies = [ + "bindgen", + "cmake", + "fs_extra", + "fslock", +] + [[package]] name = "bumpalo" version = "3.20.2" @@ -865,6 +859,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "foreign-types" version = "0.5.0" @@ -1089,19 +1089,13 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" -[[package]] -name = "hashbrown" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" - [[package]] name = "hashbrown" version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "foldhash", + "foldhash 0.1.5", ] [[package]] @@ -1110,6 +1104,17 @@ version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] + [[package]] name = "heck" version = "0.5.0" @@ -1172,9 +1177,9 @@ dependencies = [ [[package]] name = "http2" -version = "0.5.15" +version = "0.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c45c6490693ee8a8d0d95fdbdf76fead9fb87548f7894137259a7c6d22821948" +checksum = "569ef7a780e853c4e1768f58a3c8168193b82cdcbab66638a0b1c6583ec5995e" dependencies = [ "atomic-waker", "bytes", @@ -1183,7 +1188,6 @@ dependencies = [ "futures-sink", "http", "indexmap", - "parking_lot", "slab", "smallvec", "tokio", @@ -1495,9 +1499,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "libc" -version = "0.2.183" +version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" [[package]] name = "libloading" @@ -1563,6 +1567,15 @@ dependencies = [ "weezl", ] +[[package]] +name = "lru" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a860605968fce16869fd239cf4237a82f3ac470723415db603b0e8b6c8d4fb9" +dependencies = [ + "hashbrown 0.17.1", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -2375,17 +2388,6 @@ dependencies = [ "syn", ] -[[package]] -name = "schnellru" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "356285bbf17bea63d9e52e96bd18f039672ac92b55b8cb997d6162a2a37d1649" -dependencies = [ - "ahash", - "cfg-if", - "hashbrown 0.13.2", -] - [[package]] name = "scopeguard" version = "1.2.0" @@ -2779,9 +2781,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.50.0" +version = "1.52.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" dependencies = [ "bytes", "libc", @@ -2795,20 +2797,20 @@ dependencies = [ ] [[package]] -name = "tokio-boring2" -version = "5.0.0-alpha.13" +name = "tokio-btls" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f81df1210d791f31d72d840de8fbd80b9c3cb324956523048b1413e2bd55756" +checksum = "2e1fd638ec35427faf3b8f412e0fdd6fae76591d79dba40f38fa667d22bc44dd" dependencies = [ - "boring2", + "btls", "tokio", ] [[package]] name = "tokio-macros" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", @@ -3219,7 +3221,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.6.5" +version = "0.6.7" dependencies = [ "clap", "dotenvy", @@ -3240,7 +3242,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.6.5" +version = "0.6.7" dependencies = [ "ego-tree", "once_cell", @@ -3258,11 +3260,12 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.6.5" +version = "0.6.7" dependencies = [ "async-trait", "bytes", "calamine", + "futures-util", "http", "quick-xml 0.37.5", "rand 0.8.5", @@ -3284,7 +3287,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.6.5" +version = "0.6.7" dependencies = [ "async-trait", "reqwest", @@ -3297,7 +3300,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.6.5" +version = "0.6.7" dependencies = [ "dirs", "dotenvy", @@ -3317,7 +3320,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.6.5" +version = "0.6.7" dependencies = [ "pdf-extract", "thiserror", @@ -3326,7 
+3329,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.6.5" +version = "0.6.7" dependencies = [ "anyhow", "axum", @@ -3347,9 +3350,9 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" +checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" dependencies = [ "rustls-pki-types", ] @@ -3696,17 +3699,14 @@ dependencies = [ [[package]] name = "wreq" -version = "6.0.0-rc.28" +version = "6.0.0-rc.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f79937f6c4df65b3f6f78715b9de2977afe9ee3b3436483c7949a24511e25935" +checksum = "3f0eba5f5814a94e5f1a99156f187133464e525b66bdbc69a9627d46530af2e1" dependencies = [ - "ahash", - "boring2", - "brotli", + "btls", + "btls-sys", "bytes", "cookie", - "flate2", - "futures-channel", "futures-util", "http", "http-body", @@ -3715,29 +3715,64 @@ dependencies = [ "httparse", "ipnet", "libc", + "lru", "percent-encoding", "pin-project-lite", - "schnellru", - "smallvec", "socket2", + "sync_wrapper", "tokio", - "tokio-boring2", + "tokio-btls", + "tokio-util", "tower", "tower-http", "url", - "want", "webpki-root-certs", - "zstd", + "wreq-proto", + "wreq-rt", +] + +[[package]] +name = "wreq-proto" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a43942f024bb303f1042c9aa3c87fa1d9149f507c65db6e5220a11ccdb207387" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "http2", + "httparse", + "pin-project-lite", + "smallvec", + "tokio", + "tokio-util", + "want", +] + +[[package]] +name = "wreq-rt" +version = "0.2.2-rc.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99e9bce67a3fa3dd3f1503f066d86661c9caf399a763d3bd184da7afaf886c8b" +dependencies = [ + "pin-project-lite", + 
"tokio", + "wreq-proto", ] [[package]] name = "wreq-util" -version = "3.0.0-rc.10" +version = "3.0.0-rc.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c6bbe24d28beb9ceb58b514bd6a613c759d3b706f768b9d2950d5d35b543c04" +checksum = "baa5d2ab72139256916ca352a3d05c53d74e1dd360052eb5ba7691033c417c65" dependencies = [ + "brotli", + "flate2", "typed-builder", "wreq", + "zstd", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 124c620..85ab113 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.6.5" +version = "0.6.7" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 1348824..37a04ff 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -166,6 +166,14 @@ struct Cli { #[arg(long)] urls_file: Option, + /// Assert that the URL has been handled for shell escaping. Suppresses + /// the URL-truncation stderr warning. Use when the URL is intentionally + /// passed with an empty/keyless query (e.g. legacy CGI) or when a + /// trailing `&` is genuinely part of the URL. The URL is fetched as-is + /// (no extra normalization beyond the standard scheme prepend). + #[arg(long)] + url_encoded: bool, + /// Output format (markdown, json, text, llm, html) #[arg(short, long, default_value = "markdown")] format: OutputFormat, @@ -591,6 +599,31 @@ fn normalize_url(url: &str) -> String { } } +/// M14: detect URLs that look truncated by the shell (e.g. an unquoted URL +/// that the shell split on `&` or `?`). Returns `true` when: +/// - the URL ends with `&` (a trailing param separator suggests the next +/// param was lopped off), OR +/// - the URL contains `?` but no `=` after it (a query with bare keys is +/// rare; usually a real query has at least one `=`). +/// +/// Informational only — caller decides whether to warn / abort. 
/// M14: heuristic check for a URL that the shell may have cut short
/// (typically an unquoted URL containing `&` or `?`).
///
/// Returns `true` when either:
/// - the trimmed input ends with `&` (a dangling parameter separator), or
/// - the URL has a query component (the text after the first `?` that
///   precedes any `#`) containing no `=`, including an empty query (`/?`).
///
/// A `?` that appears only inside the fragment is not a query delimiter and
/// does not trigger the check.
///
/// This function only reports; it never modifies or rejects the URL.
/// Legitimate bare-key queries are reported too, so callers skip the check
/// when the user passes `--url-encoded`.
fn looks_truncated(url: &str) -> bool {
    let trimmed = url.trim();
    if trimmed.ends_with('&') {
        return true;
    }
    // The fragment starts at the first `#` and may legally contain `?`, so
    // cut it off before looking for the query delimiter.
    let before_fragment = trimmed.split_once('#').map_or(trimmed, |(head, _)| head);
    before_fragment
        .split_once('?')
        .is_some_and(|(_, query)| !query.contains('='))
}
+ let fetch_fut = client.fetch_and_extract_with_options(url, &options); + let result = webclaw_fetch::with_progress(url, fetch_fut) .await .map_err(|e| format!("fetch error: {e}"))?; @@ -2879,6 +2923,61 @@ mod tests { let _ = std::fs::remove_dir_all(&dir); } + // M14: URL truncation heuristic tests. + #[test] + fn looks_truncated_fires_on_trailing_ampersand() { + // The most common shell-split shape: `?a=1&` lost the `b=2`. + assert!(looks_truncated("https://example.com/?a=1&")); + assert!(looks_truncated("https://example.com/path?key=val&")); + } + + #[test] + fn looks_truncated_fires_on_query_with_no_equals() { + // `?foo` with no `=` is a strong signal the shell ate the `=value`. + assert!(looks_truncated("https://example.com/?foo")); + // Bare `?` (empty query) also looks like the shell ate the whole pair. + assert!(looks_truncated("https://example.com/?")); + // Same with a fragment after — strip fragment before checking. + assert!(looks_truncated("https://example.com/?foo#section")); + } + + #[test] + fn looks_truncated_silent_on_clean_url() { + // Normal URLs (no query, or query with at least one `=`) are clean. + assert!(!looks_truncated("https://example.com/")); + assert!(!looks_truncated("https://example.com/path/to/page")); + assert!(!looks_truncated("https://example.com/?a=1")); + assert!(!looks_truncated("https://example.com/?a=1&b=2")); + assert!(!looks_truncated( + "https://example.com/?a=1&b=2&c=hello%20world" + )); + // Hash anchors without a query are clean. + assert!(!looks_truncated("https://example.com/page#section")); + } + + #[test] + fn looks_truncated_silent_with_url_encoded_assertion_modeled_via_skip() { + // The --url-encoded flag suppresses the warning at the call site + // (main.rs gates the eprintln! behind `if !cli.url_encoded`). + // This test models the gate logic directly: when --url-encoded is set, + // the warning branch is never entered, even on a truncated-looking URL. 
+ let url = "https://example.com/?a=1&"; + let url_encoded_flag = true; + let should_warn = !url_encoded_flag && looks_truncated(url); + assert!( + !should_warn, + "--url-encoded must suppress the warning even on URL ending with &" + ); + + // Sanity: same URL without --url-encoded does warn. + let url_encoded_flag = false; + let should_warn = !url_encoded_flag && looks_truncated(url); + assert!( + should_warn, + "without --url-encoded, the warning should fire on URL ending with &" + ); + } + #[test] fn research_slug_truncation_is_char_safe() { // Multibyte query: byte-slicing at 50 would panic mid-codepoint. diff --git a/crates/webclaw-fetch/Cargo.toml b/crates/webclaw-fetch/Cargo.toml index de3036b..4671dc1 100644 --- a/crates/webclaw-fetch/Cargo.toml +++ b/crates/webclaw-fetch/Cargo.toml @@ -14,13 +14,16 @@ tracing = { workspace = true } tokio = { workspace = true } async-trait = "0.1" # Pinned to exact pre-release versions: wreq/wreq-util are release candidates -# with no semver stability between rc.N builds (rc.29 broke the TLS + Response -# API). An exact pin keeps `cargo build`, `cargo install` (which ignores -# Cargo.lock), and the release workflow all on the version that compiles. -wreq = { version = "=6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] } -wreq-util = "=3.0.0-rc.10" +# with no semver stability between rc.N builds. An exact pin keeps `cargo build`, +# `cargo install` (which ignores Cargo.lock), and the release workflow all on the +# version that compiles. +wreq = { version = "=6.0.0-rc.29", features = ["cookies", "gzip", "brotli", "zstd", "deflate", "stream"] } +wreq-util = "=3.0.0-rc.12" http = "1" bytes = "1" +# Stream adapter for `wreq::Response::bytes_stream()` (wreq 6.0.0-rc.29 dropped +# `Response::chunk()`); used to buffer bodies under the running size ceiling. 
+futures-util = "0.3" url = "2" rand = "0.8" quick-xml = { version = "0.37", features = ["serde"] } diff --git a/crates/webclaw-fetch/src/client.rs b/crates/webclaw-fetch/src/client.rs index 2bfd8c5..0724cec 100644 --- a/crates/webclaw-fetch/src/client.rs +++ b/crates/webclaw-fetch/src/client.rs @@ -12,6 +12,7 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use std::time::{Duration, Instant}; +use futures_util::StreamExt; use rand::seq::SliceRandom; use tokio::sync::Semaphore; use tracing::{debug, instrument, warn}; @@ -118,7 +119,7 @@ impl Response { /// negotiated), so a tiny compressed payload that inflates to /// gigabytes is aborted as soon as the accumulated size crosses the /// cap — it never gets fully buffered in memory. - async fn from_wreq(mut resp: wreq::Response) -> Result { + async fn from_wreq(resp: wreq::Response) -> Result { if let Some(len) = resp.content_length() && len > MAX_BODY_BYTES { @@ -130,12 +131,13 @@ impl Response { let url = resp.uri().to_string(); let headers = resp.headers().clone(); + // wreq 6.0.0-rc.29 dropped `Response::chunk()`. Stream post-decompression + // bytes via `bytes_stream()` and keep enforcing the running ceiling so a + // compression bomb is aborted before it is fully buffered in memory. let mut buf = bytes::BytesMut::new(); - while let Some(chunk) = resp - .chunk() - .await - .map_err(|e| FetchError::BodyDecode(e.to_string()))? 
- { + let mut stream = resp.bytes_stream(); + while let Some(chunk) = stream.next().await { + let chunk = chunk.map_err(|e| FetchError::BodyDecode(e.to_string()))?; check_body_ceiling(buf.len(), chunk.len())?; buf.extend_from_slice(&chunk); } diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs index 029a7b6..b859955 100644 --- a/crates/webclaw-fetch/src/lib.rs +++ b/crates/webclaw-fetch/src/lib.rs @@ -11,6 +11,7 @@ pub mod extractors; pub mod fetcher; pub mod linkedin; pub mod locale; +pub mod progress; pub mod proxy; pub mod reddit; pub mod sitemap; @@ -24,6 +25,7 @@ pub use error::FetchError; pub use fetcher::Fetcher; pub use http::HeaderMap; pub use locale::{accept_language_for_tld, accept_language_for_url}; +pub use progress::{PROGRESS_INTERVAL, with_progress}; pub use proxy::{parse_proxy_file, parse_proxy_line}; pub use sitemap::SitemapEntry; pub use webclaw_pdf::PdfMode; diff --git a/crates/webclaw-fetch/src/progress.rs b/crates/webclaw-fetch/src/progress.rs new file mode 100644 index 0000000..75931dd --- /dev/null +++ b/crates/webclaw-fetch/src/progress.rs @@ -0,0 +1,293 @@ +//! Periodic stderr progress line emitter for slow fetches (M13). +//! +//! Wraps any async fetch future with a `tokio::select!` against a +//! `tokio::time::interval`. Every `PROGRESS_INTERVAL` (default 10s) of +//! elapsed time, emits one line to STDERR of the form: +//! +//! ```text +//! # webclaw: still fetching (Ns) +//! ``` +//! +//! Fetches completing in under `PROGRESS_INTERVAL` emit zero lines (the +//! timer never fires). Stdout is untouched. +//! +//! The URL is truncated to at most 80 chars (head + `...` + tail) so +//! pathological query strings don't blow up the stderr line. Truncation +//! is char-boundary safe (operates on `chars`, not bytes). + +use std::future::Future; +use std::time::Duration; + +use tokio::time::{Instant, MissedTickBehavior, interval}; + +/// Default progress emission interval. 
The first tick fires at +10s +/// elapsed; subsequent ticks at +20s, +30s, etc. +pub const PROGRESS_INTERVAL: Duration = Duration::from_secs(10); + +/// Maximum URL length in the progress line. Longer URLs are truncated +/// `head...tail` style. +const MAX_URL_LEN: usize = 80; + +/// Wrap a fetch future with the default 10s progress emitter. Writes +/// progress lines to STDERR via `eprintln!`. Returns the inner future's +/// result unchanged. +pub async fn with_progress(url: &str, future: F) -> T +where + F: Future, +{ + with_progress_writer(url, future, PROGRESS_INTERVAL, |s| eprintln!("{s}")).await +} + +/// Test-friendly variant of [`with_progress`]: caller supplies the tick +/// interval (so tests can use a 50ms period instead of 10s) and a +/// writer closure (so tests can capture emitted lines without touching +/// real stderr). +/// +/// Production code uses [`with_progress`] which delegates here with +/// [`PROGRESS_INTERVAL`] and an `eprintln!` writer. +pub async fn with_progress_writer( + url: &str, + future: F, + period: Duration, + mut writer: W, +) -> T +where + F: Future, + W: FnMut(String), +{ + let start = Instant::now(); + let mut ticker = interval(period); + // First tick of `tokio::time::interval(period)` fires *immediately* + // (at construction time). We don't want a t=0 emit — consume that + // first tick before entering the select loop. Subsequent ticks fire + // at `start + period`, `start + 2*period`, ... + ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); + ticker.tick().await; + + tokio::pin!(future); + + loop { + tokio::select! { + // Bias toward the future — if both are ready (rare), prefer + // returning the result over emitting a final tick. + biased; + result = &mut future => { + return result; + } + _ = ticker.tick() => { + let elapsed = start.elapsed(); + writer(format_progress_line(url, elapsed)); + } + } + } +} + +/// Build the progress line: `# webclaw: still fetching (Ns)`. 
/// Shorten `url` to at most `max` chars for display.
///
/// URLs that already fit are returned unchanged. Longer ones are rendered as
/// `head...tail`: up to 17 chars are kept from the end of the URL and the
/// rest of the budget goes to the head. When `max` is too small to hold the
/// `...` marker (`max < 3`), the result is just the first `max` chars.
///
/// All counting is done in `char`s, never bytes, so multi-byte text is not
/// split inside a code point.
pub(crate) fn truncate_url(url: &str, max: usize) -> String {
    const ELLIPSIS: &str = "...";
    const TAIL_CHARS: usize = 17;

    let total_chars = url.chars().count();
    if total_chars <= max {
        return url.to_string();
    }
    if max < ELLIPSIS.len() {
        // No room for the marker; a hard cut is the only way to honor `max`.
        return url.chars().take(max).collect();
    }

    // Budget left after the marker. The tail gets its fixed share first
    // (capped by the budget so small `max` values are still respected) and
    // the head takes whatever remains.
    let avail = max - ELLIPSIS.len();
    let tail_chars = TAIL_CHARS.min(avail);
    let head_chars = avail - tail_chars;

    let head: String = url.chars().take(head_chars).collect();
    // `total_chars > max >= tail_chars`, so the subtraction cannot underflow.
    let tail: String = url.chars().skip(total_chars - tail_chars).collect();
    format!("{head}{ELLIPSIS}{tail}")
}
+ let fut = tokio::time::sleep(Duration::from_millis(250)); + with_progress_writer( + "https://example.com/slow", + async { + fut.await; + 42_i32 + }, + Duration::from_millis(50), + writer, + ) + .await; + let lines = sink.lock().unwrap(); + assert!( + !lines.is_empty(), + "expected >=1 progress line; got {} ({:?})", + lines.len(), + *lines + ); + for line in lines.iter() { + assert!( + line.starts_with("# webclaw: still fetching"), + "line shape wrong: {line:?}" + ); + assert!( + line.contains("https://example.com/slow"), + "url missing from line: {line:?}" + ); + } + } + + #[tokio::test] + async fn test_progress_silent_on_fast_future() { + let (sink, writer) = capture(); + // 10ms future, 1s interval — zero ticks expected. + let result = with_progress_writer( + "https://example.com/fast", + async { + tokio::time::sleep(Duration::from_millis(10)).await; + "done" + }, + Duration::from_secs(1), + writer, + ) + .await; + assert_eq!(result, "done"); + let lines = sink.lock().unwrap(); + assert_eq!( + lines.len(), + 0, + "expected 0 progress lines on fast future; got {:?}", + *lines + ); + } + + #[tokio::test] + async fn test_progress_line_includes_url() { + let (sink, writer) = capture(); + let target_url = "https://news.ycombinator.com/item?id=12345"; + with_progress_writer( + target_url, + async { + tokio::time::sleep(Duration::from_millis(150)).await; + }, + Duration::from_millis(50), + writer, + ) + .await; + let lines = sink.lock().unwrap(); + assert!(!lines.is_empty(), "expected progress lines"); + assert!( + lines.iter().all(|l| l.contains(target_url)), + "every line should contain the URL: {:?}", + *lines + ); + } + + #[tokio::test] + async fn test_progress_returns_inner_result_ok() { + let (_sink, writer) = capture(); + let r: Result = with_progress_writer( + "https://example.com/", + async { Ok::(7) }, + Duration::from_secs(1), + writer, + ) + .await; + assert_eq!(r, Ok(7)); + } + + #[tokio::test] + async fn test_progress_propagates_error() { + let (_sink, 
writer) = capture(); + let r: Result = with_progress_writer( + "https://example.com/", + async { Err::("boom".to_string()) }, + Duration::from_secs(1), + writer, + ) + .await; + assert_eq!(r, Err("boom".to_string())); + } + + #[test] + fn test_truncate_url_short_passthrough() { + let url = "https://example.com/"; + assert_eq!(truncate_url(url, 80), url); + } + + #[test] + fn test_truncate_url_long_head_dots_tail() { + let url = "https://www.example.com/very/long/path/segments/with/lots/of/text/and/then?q=some_long_query_string_value_here&other=more&another=thing"; + let truncated = truncate_url(url, 80); + assert!( + truncated.chars().count() <= 80, + "truncated length {} > 80: {truncated:?}", + truncated.chars().count() + ); + assert!( + truncated.contains("..."), + "expected '...' marker in truncated url: {truncated:?}" + ); + assert!( + truncated.starts_with("https://www.example.com/"), + "truncated should start with the URL head: {truncated:?}" + ); + } + + #[test] + fn test_truncate_url_unicode_safe() { + // Cyrillic URL longer than 80 chars — must not panic on a + // mid-codepoint split. + let url = + "https://example.com/путь/к/очень/длинной/странице/с/большим/количеством/кириллицы/тут"; + let truncated = truncate_url(url, 80); + assert!(truncated.is_char_boundary(truncated.len())); + // Roundtrip through chars to confirm valid UTF-8 throughout. + let _: String = truncated.chars().collect(); + } + + #[test] + fn test_format_progress_line_shape() { + let line = format_progress_line("https://example.com/", Duration::from_secs(10)); + assert_eq!(line, "# webclaw: still fetching https://example.com/ (10s)"); + } + + #[test] + fn test_format_progress_line_seconds_only() { + // Sub-second elapsed rounds to 0s, not fractions. (In practice + // the first tick fires at +PROGRESS_INTERVAL so this is mostly + // a defensive shape assertion.) 
+ let line = format_progress_line("https://x/", Duration::from_millis(9_500)); + assert!( + line.ends_with("(9s)"), + "line should end with `(9s)`: {line:?}" + ); + } +} diff --git a/crates/webclaw-fetch/src/tls.rs b/crates/webclaw-fetch/src/tls.rs index c6c2955..02209b3 100644 --- a/crates/webclaw-fetch/src/tls.rs +++ b/crates/webclaw-fetch/src/tls.rs @@ -10,15 +10,24 @@ use std::{borrow::Cow, io, time::Duration}; use wreq::http2::{ Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId, }; -use wreq::tls::{ - AlpnProtocol, AlpsProtocol, CertificateCompressionAlgorithm, ExtensionType, TlsOptions, - TlsVersion, -}; -use wreq::{Client, Emulation}; +use wreq::tls::compress::CertificateCompressor; +use wreq::tls::{AlpnProtocol, AlpsProtocol, ExtensionType, TlsOptions, TlsVersion}; +use wreq::{Client, Emulation, Group, IntoEmulation}; +use wreq_util::emulate::compress::{BrotliCompressor, ZlibCompressor}; use crate::browser::BrowserVariant; use crate::error::FetchError; +// Certificate-compression advertisement per profile. wreq 6.0.0-rc.29 replaced +// the `CertificateCompressionAlgorithm` enum argument with `&dyn +// CertificateCompressor` trait objects; wreq-util ships the concrete zlib/brotli +// implementations. The advertised set (and order) is a TLS fingerprint signal, +// so these mirror the previous enum lists exactly. 
+static CHROME_CERT_COMPRESSORS: &[&'static dyn CertificateCompressor] = &[&BrotliCompressor]; +static FIREFOX_CERT_COMPRESSORS: &[&'static dyn CertificateCompressor] = + &[&ZlibCompressor, &BrotliCompressor]; +static SAFARI_CERT_COMPRESSORS: &[&'static dyn CertificateCompressor] = &[&ZlibCompressor]; + #[derive(Clone, Default)] struct PublicDnsResolver; @@ -119,14 +128,14 @@ fn chrome_extensions() -> Vec { ExtensionType::PSK_KEY_EXCHANGE_MODES, // 45 ExtensionType::EC_POINT_FORMATS, // 11 ExtensionType::CERT_COMPRESSION, // 27 - ExtensionType::APPLICATION_SETTINGS_NEW, // 17613 (new codepoint, matches alps_use_new_codepoint) - ExtensionType::SUPPORTED_VERSIONS, // 43 - ExtensionType::SIGNATURE_ALGORITHMS, // 13 - ExtensionType::SERVER_NAME, // 0 + ExtensionType::APPLICATION_SETTINGS, // 17613 (new codepoint, matches alps_use_new_codepoint) + ExtensionType::SUPPORTED_VERSIONS, // 43 + ExtensionType::SIGNATURE_ALGORITHMS, // 13 + ExtensionType::SERVER_NAME, // 0 ExtensionType::APPLICATION_LAYER_PROTOCOL_NEGOTIATION, // 16 - ExtensionType::ENCRYPTED_CLIENT_HELLO, // 65037 - ExtensionType::RENEGOTIATE, // 65281 - ExtensionType::EXTENDED_MASTER_SECRET, // 23 + ExtensionType::ENCRYPTED_CLIENT_HELLO, // 65037 + ExtensionType::RENEGOTIATE, // 65281 + ExtensionType::EXTENDED_MASTER_SECRET, // 23 ] } @@ -287,7 +296,7 @@ fn chrome_tls() -> TlsOptions { .alps_protocols([AlpsProtocol::HTTP3, AlpsProtocol::HTTP2]) .alps_use_new_codepoint(true) .aes_hw_override(true) - .certificate_compression_algorithms(&[CertificateCompressionAlgorithm::BROTLI]) + .certificate_compressors(CHROME_CERT_COMPRESSORS) .build() } @@ -304,10 +313,7 @@ fn firefox_tls() -> TlsOptions { .pre_shared_key(true) .enable_ocsp_stapling(true) .enable_signed_cert_timestamps(true) - .certificate_compression_algorithms(&[ - CertificateCompressionAlgorithm::ZLIB, - CertificateCompressionAlgorithm::BROTLI, - ]) + .certificate_compressors(FIREFOX_CERT_COMPRESSORS) .build() } @@ -324,7 +330,7 @@ fn safari_tls() -> 
TlsOptions { .pre_shared_key(false) .enable_ocsp_stapling(true) .enable_signed_cert_timestamps(true) - .certificate_compression_algorithms(&[CertificateCompressionAlgorithm::ZLIB]) + .certificate_compressors(SAFARI_CERT_COMPRESSORS) .build() } @@ -345,21 +351,23 @@ fn safari_tls() -> TlsOptions { /// `priority: u=0, i`, zstd), replace with the real iOS 26 set. /// 4. `accept-language` preserved from config.extra_headers for locale. fn safari_ios_emulation() -> wreq::Emulation { - use wreq::EmulationFactory; - let mut em = wreq_util::Emulation::SafariIos26.emulation(); + // wreq 6.0.0-rc.29 exposes the `Emulation` fields directly (no `*_mut()` + // accessors) and wreq-util 3.0.0-rc.12 renamed the enum to `Profile` with + // `IntoEmulation::into_emulation` replacing `EmulationFactory::emulation`. + let mut em = wreq_util::Profile::SafariIos26.into_emulation(); - if let Some(tls) = em.tls_options_mut().as_mut() { + if let Some(tls) = em.tls_options.as_mut() { tls.extension_permutation = Some(Cow::Owned(safari_ios_extensions())); } // Only override the priority flag — keep wreq-util's SETTINGS, WINDOW_UPDATE, // and pseudo-order intact. Replacing the whole Http2Options resets SETTINGS // to defaults, which sends only INITIAL_WINDOW_SIZE and fails DataDome. - if let Some(h2) = em.http2_options_mut().as_mut() { + if let Some(h2) = em.http2_options.as_mut() { h2.headers_stream_dependency = Some(StreamDependency::new(StreamId::zero(), 255, true)); } - let hm = em.headers_mut(); + let hm = &mut em.headers; hm.clear(); for (k, v) in SAFARI_IOS_HEADERS { if let (Ok(n), Ok(val)) = ( @@ -508,12 +516,12 @@ pub fn build_client( .tls_options(tls) .http2_options(h2) .headers(build_headers(headers)) - .build() + .build(Group::default()) } }; // Append extra headers after profile defaults. - let hm = emulation.headers_mut(); + let hm = &mut emulation.headers; for (k, v) in extra_headers { if let (Ok(n), Ok(val)) = ( http::header::HeaderName::from_bytes(k.as_bytes()),