Compare commits

...

8 commits
v0.6.5 ... main

Author SHA1 Message Date
Valerio
d0d7b835f2 docs(readme): update banner to new webclaw branding 2026-06-09 18:53:14 +02:00
Valerio
6519ac2a8b chore(release): v0.6.7 2026-06-09 12:38:03 +02:00
Valerio
14ded4b99e chore(deps): bump wreq 6.0.0-rc.29, wreq-util 3.0.0-rc.12
Ports the TLS/Response API breaks in the bump:
- certificate_compression_algorithms -> certificate_compressors with
  wreq-util's BrotliCompressor/ZlibCompressor trait objects
- ExtensionType::APPLICATION_SETTINGS_NEW -> APPLICATION_SETTINGS (same
  codepoint 17613)
- wreq_util::Emulation::SafariIos26.emulation() ->
  Profile::SafariIos26.into_emulation(); Emulation fields are now public
  so *_mut() accessors become direct field access; build() takes a Group
- Response::chunk() removed -> bytes_stream() (wreq 'stream' feature) with
  the running body-size ceiling preserved; adds futures-util

Browser fingerprints verified unchanged on tls.peet.ws: Chrome JA3
43067709b025da334de1279a120f8e14, Safari iOS JA3 8d909525bd5bbb79f133d11cc05159fe.
2026-06-09 12:38:03 +02:00
Valerio
72a451cfb6 chore(release): sync Cargo.lock to v0.6.6 2026-06-09 11:26:18 +02:00
Valerio
17fce81a95 chore(release): v0.6.6
Salvaged two CLI ergonomics fixes from #49:
- periodic progress line on slow fetches (stderr)
- --url-encoded flag + URL truncation warning
2026-06-09 11:24:13 +02:00
Valerio
84a0f9774d style: apply rustfmt to salvaged #49 commits 2026-06-09 11:24:13 +02:00
devnen
519dfb7864 feat(cli): URL truncation warning + --url-encoded flag
When bash splits a URL at & or ? (a common foot-gun), webclaw
receives only the truncated prefix and silently fetches the wrong
page. Per issue #6:

1. Heuristic warning: if the URL ends with '&' or contains '?' with
   no '=' after, emit a stderr warning before fetching:
     # webclaw: warning: URL looks truncated (ends with '&' or '?'); did the shell split it? Quote the URL or use --url-encoded.

2. New flag --url-encoded: parallel input that asserts the user has
   handled escaping. Suppresses the truncation warning since intent
   is explicit.

Fetch proceeds in both cases; this is informational only. 4 new
tests in webclaw-cli. Workspace 720 -> 724.

(cherry picked from commit 4ef27fcd33)
2026-06-09 11:24:13 +02:00
devnen
985a90b083 feat(fetch): periodic progress stderr line on slow fetches
Webclaw's default -t timeout is 30s; slow sites previously sat
silently with no feedback. Now during a fetch, every 10s of elapsed
time webclaw writes one line to stderr:

  # webclaw: still fetching <URL> (Ns)

Fetches completing in under 10s emit nothing (the timer never fires).
Stdout output is untouched - pure feedback signal on stderr.

No timeout change. No new flags. Default behavior is augmented at
stderr only.

Implemented via tokio::select! between the fetch future and a
tokio::time::interval. Latency cost: one interval timer polled in the
same task (nothing is spawned) - microseconds on the fast path.

10 new tests in webclaw-fetch::progress::tests (none ignored; the
slow-future test uses a 50ms test interval to keep cargo test fast).
Workspace total 710 -> 720.

(cherry picked from commit 06f065cb08)
2026-06-09 11:24:13 +02:00
10 changed files with 589 additions and 132 deletions

BIN
.github/banner.png vendored

Binary file not shown.

Before

Width:  |  Height:  |  Size: 44 KiB

After

Width:  |  Height:  |  Size: 48 KiB

Before After
Before After

View file

@ -3,6 +3,21 @@
All notable changes to webclaw are documented here.
Format follows [Keep a Changelog](https://keepachangelog.com/).
## [0.6.7] — 2026-06-09
### Changed
- Updated the HTTP/TLS engine (wreq 6.0.0-rc.29, wreq-util 3.0.0-rc.12). This pulls in upstream robustness fixes: no more panic on responses with non-UTF8 header values, a fix for short reads when decoding large compressed bodies, and the TCP nodelay setting is restored. Browser TLS fingerprints are unchanged.
---
## [0.6.6] — 2026-06-09
### Added
- Slow fetches now print a progress line to stderr every 10 seconds (`# webclaw: still fetching <url> (Ns)`) so a long request no longer looks like the CLI hung. Fast fetches stay silent and stdout is untouched.
- New `--url-encoded` flag plus a warning when a URL looks like the shell split it on `&` or `?`. The warning suggests quoting the URL; pass `--url-encoded` to silence it when the URL is intentional.
---
## [0.6.5] — 2026-06-04
### Changed

221
Cargo.lock generated
View file

@ -28,18 +28,6 @@ dependencies = [
"cpufeatures",
]
[[package]]
name = "ahash"
version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [
"cfg-if",
"once_cell",
"version_check",
"zerocopy",
]
[[package]]
name = "aho-corasick"
version = "1.1.4"
@ -64,6 +52,12 @@ dependencies = [
"alloc-no-stdlib",
]
[[package]]
name = "allocator-api2"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
[[package]]
name = "android_system_properties"
version = "0.1.5"
@ -272,9 +266,9 @@ dependencies = [
[[package]]
name = "bitflags"
version = "2.11.0"
version = "2.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8"
[[package]]
name = "block-buffer"
@ -285,31 +279,6 @@ dependencies = [
"generic-array",
]
[[package]]
name = "boring-sys2"
version = "5.0.0-alpha.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "455d79965f5155dcc88a7abce112c3590883889131b799beda10bf9a813ed669"
dependencies = [
"bindgen",
"cmake",
"fs_extra",
"fslock",
]
[[package]]
name = "boring2"
version = "5.0.0-alpha.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "183ccc3854411c035410dcdbffafca62084f3a6c33f013c77e83c025d2a08a28"
dependencies = [
"bitflags",
"boring-sys2",
"foreign-types",
"libc",
"openssl-macros",
]
[[package]]
name = "brotli"
version = "8.0.2"
@ -331,6 +300,31 @@ dependencies = [
"alloc-stdlib",
]
[[package]]
name = "btls"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c5e60b8c8d282c86360cab651ded04ab0335a7b5390c8d34145cbeab8cacf5f"
dependencies = [
"bitflags",
"btls-sys",
"foreign-types",
"libc",
"openssl-macros",
]
[[package]]
name = "btls-sys"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b1b8638a2e1c38a5ae4efa90ae57e643baec35a30d03fc5b399b893adc4954b"
dependencies = [
"bindgen",
"cmake",
"fs_extra",
"fslock",
]
[[package]]
name = "bumpalo"
version = "3.20.2"
@ -865,6 +859,12 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "foldhash"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
[[package]]
name = "foreign-types"
version = "0.5.0"
@ -1089,19 +1089,13 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
[[package]]
name = "hashbrown"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
[[package]]
name = "hashbrown"
version = "0.15.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
dependencies = [
"foldhash",
"foldhash 0.1.5",
]
[[package]]
@ -1110,6 +1104,17 @@ version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
[[package]]
name = "hashbrown"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash 0.2.0",
]
[[package]]
name = "heck"
version = "0.5.0"
@ -1172,9 +1177,9 @@ dependencies = [
[[package]]
name = "http2"
version = "0.5.15"
version = "0.5.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c45c6490693ee8a8d0d95fdbdf76fead9fb87548f7894137259a7c6d22821948"
checksum = "569ef7a780e853c4e1768f58a3c8168193b82cdcbab66638a0b1c6583ec5995e"
dependencies = [
"atomic-waker",
"bytes",
@ -1183,7 +1188,6 @@ dependencies = [
"futures-sink",
"http",
"indexmap",
"parking_lot",
"slab",
"smallvec",
"tokio",
@ -1495,9 +1499,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
[[package]]
name = "libc"
version = "0.2.183"
version = "0.2.186"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d"
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
[[package]]
name = "libloading"
@ -1563,6 +1567,15 @@ dependencies = [
"weezl",
]
[[package]]
name = "lru"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a860605968fce16869fd239cf4237a82f3ac470723415db603b0e8b6c8d4fb9"
dependencies = [
"hashbrown 0.17.1",
]
[[package]]
name = "lru-slab"
version = "0.1.2"
@ -2375,17 +2388,6 @@ dependencies = [
"syn",
]
[[package]]
name = "schnellru"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "356285bbf17bea63d9e52e96bd18f039672ac92b55b8cb997d6162a2a37d1649"
dependencies = [
"ahash",
"cfg-if",
"hashbrown 0.13.2",
]
[[package]]
name = "scopeguard"
version = "1.2.0"
@ -2779,9 +2781,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.50.0"
version = "1.52.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d"
checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe"
dependencies = [
"bytes",
"libc",
@ -2795,20 +2797,20 @@ dependencies = [
]
[[package]]
name = "tokio-boring2"
version = "5.0.0-alpha.13"
name = "tokio-btls"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f81df1210d791f31d72d840de8fbd80b9c3cb324956523048b1413e2bd55756"
checksum = "2e1fd638ec35427faf3b8f412e0fdd6fae76591d79dba40f38fa667d22bc44dd"
dependencies = [
"boring2",
"btls",
"tokio",
]
[[package]]
name = "tokio-macros"
version = "2.6.1"
version = "2.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c"
checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
dependencies = [
"proc-macro2",
"quote",
@ -3219,7 +3221,7 @@ dependencies = [
[[package]]
name = "webclaw-cli"
version = "0.6.5"
version = "0.6.7"
dependencies = [
"clap",
"dotenvy",
@ -3240,7 +3242,7 @@ dependencies = [
[[package]]
name = "webclaw-core"
version = "0.6.5"
version = "0.6.7"
dependencies = [
"ego-tree",
"once_cell",
@ -3258,11 +3260,12 @@ dependencies = [
[[package]]
name = "webclaw-fetch"
version = "0.6.5"
version = "0.6.7"
dependencies = [
"async-trait",
"bytes",
"calamine",
"futures-util",
"http",
"quick-xml 0.37.5",
"rand 0.8.5",
@ -3284,7 +3287,7 @@ dependencies = [
[[package]]
name = "webclaw-llm"
version = "0.6.5"
version = "0.6.7"
dependencies = [
"async-trait",
"reqwest",
@ -3297,7 +3300,7 @@ dependencies = [
[[package]]
name = "webclaw-mcp"
version = "0.6.5"
version = "0.6.7"
dependencies = [
"dirs",
"dotenvy",
@ -3317,7 +3320,7 @@ dependencies = [
[[package]]
name = "webclaw-pdf"
version = "0.6.5"
version = "0.6.7"
dependencies = [
"pdf-extract",
"thiserror",
@ -3326,7 +3329,7 @@ dependencies = [
[[package]]
name = "webclaw-server"
version = "0.6.5"
version = "0.6.7"
dependencies = [
"anyhow",
"axum",
@ -3347,9 +3350,9 @@ dependencies = [
[[package]]
name = "webpki-root-certs"
version = "1.0.6"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca"
checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c"
dependencies = [
"rustls-pki-types",
]
@ -3696,17 +3699,14 @@ dependencies = [
[[package]]
name = "wreq"
version = "6.0.0-rc.28"
version = "6.0.0-rc.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f79937f6c4df65b3f6f78715b9de2977afe9ee3b3436483c7949a24511e25935"
checksum = "3f0eba5f5814a94e5f1a99156f187133464e525b66bdbc69a9627d46530af2e1"
dependencies = [
"ahash",
"boring2",
"brotli",
"btls",
"btls-sys",
"bytes",
"cookie",
"flate2",
"futures-channel",
"futures-util",
"http",
"http-body",
@ -3715,29 +3715,64 @@ dependencies = [
"httparse",
"ipnet",
"libc",
"lru",
"percent-encoding",
"pin-project-lite",
"schnellru",
"smallvec",
"socket2",
"sync_wrapper",
"tokio",
"tokio-boring2",
"tokio-btls",
"tokio-util",
"tower",
"tower-http",
"url",
"want",
"webpki-root-certs",
"zstd",
"wreq-proto",
"wreq-rt",
]
[[package]]
name = "wreq-proto"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a43942f024bb303f1042c9aa3c87fa1d9149f507c65db6e5220a11ccdb207387"
dependencies = [
"bytes",
"futures-channel",
"futures-util",
"http",
"http-body",
"http2",
"httparse",
"pin-project-lite",
"smallvec",
"tokio",
"tokio-util",
"want",
]
[[package]]
name = "wreq-rt"
version = "0.2.2-rc.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99e9bce67a3fa3dd3f1503f066d86661c9caf399a763d3bd184da7afaf886c8b"
dependencies = [
"pin-project-lite",
"tokio",
"wreq-proto",
]
[[package]]
name = "wreq-util"
version = "3.0.0-rc.10"
version = "3.0.0-rc.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c6bbe24d28beb9ceb58b514bd6a613c759d3b706f768b9d2950d5d35b543c04"
checksum = "baa5d2ab72139256916ca352a3d05c53d74e1dd360052eb5ba7691033c417c65"
dependencies = [
"brotli",
"flate2",
"typed-builder",
"wreq",
"zstd",
]
[[package]]

View file

@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"]
[workspace.package]
version = "0.6.5"
version = "0.6.7"
edition = "2024"
license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw"

View file

@ -166,6 +166,14 @@ struct Cli {
#[arg(long)]
urls_file: Option<String>,
/// Assert that the URL has been handled for shell escaping. Suppresses
/// the URL-truncation stderr warning. Use when the URL is intentionally
/// passed with an empty/keyless query (e.g. legacy CGI) or when a
/// trailing `&` is genuinely part of the URL. The URL is fetched as-is
/// (no extra normalization beyond the standard scheme prepend).
#[arg(long)]
url_encoded: bool,
/// Output format (markdown, json, text, llm, html)
#[arg(short, long, default_value = "markdown")]
format: OutputFormat,
@ -591,6 +599,31 @@ fn normalize_url(url: &str) -> String {
}
}
/// M14: detect URLs that look truncated by the shell (e.g. an unquoted URL
/// that the shell split on `&` or `?`). Returns `true` when:
/// - the URL ends with `&` (a trailing param separator suggests the next
///   param was lopped off), OR
/// - the URL has a query — a `?` located before any `#` — that contains no
///   `=` (a query with bare keys is rare; usually a real query has at least
///   one `=`).
///
/// Leading/trailing whitespace is ignored. The fragment (everything from the
/// first `#`) is removed before the query check: a fragment may itself
/// contain `?`, and such a `?` does not start a query, so
/// `https://host/page#what?` is not reported.
///
/// Informational only — caller decides whether to warn / abort. This is a
/// heuristic; legitimate URLs with bare-key queries will trigger a false
/// positive (suppressible via `--url-encoded`).
fn looks_truncated(url: &str) -> bool {
    let url = url.trim();
    if url.ends_with('&') {
        return true;
    }

    // Cut the fragment first so that neither `?foo#section` hides a bare
    // query nor `page#a?b` fakes one.
    let without_fragment = url.split('#').next().unwrap_or(url);
    match without_fragment.split_once('?') {
        Some((_, query)) => !query.contains('='),
        None => false,
    }
}
/// Derive a filename from a URL for `--output-dir`.
///
/// Strips the scheme/host, maps the path to a filesystem path, and appends
@ -826,6 +859,14 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
.urls
.first()
.ok_or("no input provided -- pass a URL, --file, or --stdin")?;
// M14: warn when the URL looks like the shell split it on `&` or `?`.
// Informational only — fetch still proceeds. Suppressed by --url-encoded,
// which asserts the caller has handled escaping intentionally.
if !cli.url_encoded && looks_truncated(raw_url) {
eprintln!(
"# webclaw: warning: URL looks truncated (ends with '&' or '?'); did the shell split it? Quote the URL or use --url-encoded."
);
}
let url = normalize_url(raw_url);
let url = url.as_str();
@ -859,8 +900,11 @@ async fn fetch_and_extract(cli: &Cli) -> Result<FetchOutput, String> {
let client =
FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?;
let options = build_extraction_options(cli);
let result = client
.fetch_and_extract_with_options(url, &options)
// M13: wrap with periodic stderr progress emitter. Fast fetches see
// zero emissions (timer never fires in <10s); slow fetches get a
// line every 10s of elapsed time so the CLI doesn't appear hung.
let fetch_fut = client.fetch_and_extract_with_options(url, &options);
let result = webclaw_fetch::with_progress(url, fetch_fut)
.await
.map_err(|e| format!("fetch error: {e}"))?;
@ -2879,6 +2923,61 @@ mod tests {
let _ = std::fs::remove_dir_all(&dir);
}
// M14: URL truncation heuristic tests.
// A URL whose last character is `&` must be reported as truncated,
// whatever precedes it.
#[test]
fn looks_truncated_fires_on_trailing_ampersand() {
// The most common shell-split shape: `?a=1&` lost the `b=2`.
assert!(looks_truncated("https://example.com/?a=1&"));
// Same shape with a non-root path in front of the query.
assert!(looks_truncated("https://example.com/path?key=val&"));
}
// A query that contains no `=` at all must be reported as truncated,
// including the empty query and a query followed by a fragment.
#[test]
fn looks_truncated_fires_on_query_with_no_equals() {
// `?foo` with no `=` is a strong signal the shell ate the `=value`.
assert!(looks_truncated("https://example.com/?foo"));
// Bare `?` (empty query) also looks like the shell ate the whole pair.
assert!(looks_truncated("https://example.com/?"));
// Same with a fragment after — strip fragment before checking.
assert!(looks_truncated("https://example.com/?foo#section"));
}
// Negative cases: URLs without a query, or whose query has at least one
// `=`, must not be reported.
#[test]
fn looks_truncated_silent_on_clean_url() {
// Normal URLs (no query, or query with at least one `=`) are clean.
assert!(!looks_truncated("https://example.com/"));
assert!(!looks_truncated("https://example.com/path/to/page"));
assert!(!looks_truncated("https://example.com/?a=1"));
assert!(!looks_truncated("https://example.com/?a=1&b=2"));
// Percent-encoded values do not affect the check.
assert!(!looks_truncated(
"https://example.com/?a=1&b=2&c=hello%20world"
));
// Hash anchors without a query are clean.
assert!(!looks_truncated("https://example.com/page#section"));
}
// Models the call-site gate `!url_encoded && looks_truncated(url)` with a
// local boolean standing in for the CLI flag. It does not parse real CLI
// arguments; it only pins the boolean combination in both directions.
#[test]
fn looks_truncated_silent_with_url_encoded_assertion_modeled_via_skip() {
// The --url-encoded flag suppresses the warning at the call site
// (main.rs gates the eprintln! behind `if !cli.url_encoded`).
// This test models the gate logic directly: when --url-encoded is set,
// the warning branch is never entered, even on a truncated-looking URL.
let url = "https://example.com/?a=1&";
let url_encoded_flag = true;
let should_warn = !url_encoded_flag && looks_truncated(url);
assert!(
!should_warn,
"--url-encoded must suppress the warning even on URL ending with &"
);
// Sanity: same URL without --url-encoded does warn.
let url_encoded_flag = false;
let should_warn = !url_encoded_flag && looks_truncated(url);
assert!(
should_warn,
"without --url-encoded, the warning should fire on URL ending with &"
);
}
#[test]
fn research_slug_truncation_is_char_safe() {
// Multibyte query: byte-slicing at 50 would panic mid-codepoint.

View file

@ -14,13 +14,16 @@ tracing = { workspace = true }
tokio = { workspace = true }
async-trait = "0.1"
# Pinned to exact pre-release versions: wreq/wreq-util are release candidates
# with no semver stability between rc.N builds (rc.29 broke the TLS + Response
# API). An exact pin keeps `cargo build`, `cargo install` (which ignores
# Cargo.lock), and the release workflow all on the version that compiles.
wreq = { version = "=6.0.0-rc.28", features = ["cookies", "gzip", "brotli", "zstd", "deflate"] }
wreq-util = "=3.0.0-rc.10"
# with no semver stability between rc.N builds. An exact pin keeps `cargo build`,
# `cargo install` (which ignores Cargo.lock), and the release workflow all on the
# version that compiles.
wreq = { version = "=6.0.0-rc.29", features = ["cookies", "gzip", "brotli", "zstd", "deflate", "stream"] }
wreq-util = "=3.0.0-rc.12"
http = "1"
bytes = "1"
# Stream adapter for `wreq::Response::bytes_stream()` (wreq 6.0.0-rc.29 dropped
# `Response::chunk()`); used to buffer bodies under the running size ceiling.
futures-util = "0.3"
url = "2"
rand = "0.8"
quick-xml = { version = "0.37", features = ["serde"] }

View file

@ -12,6 +12,7 @@ use std::hash::{Hash, Hasher};
use std::sync::Arc;
use std::time::{Duration, Instant};
use futures_util::StreamExt;
use rand::seq::SliceRandom;
use tokio::sync::Semaphore;
use tracing::{debug, instrument, warn};
@ -118,7 +119,7 @@ impl Response {
/// negotiated), so a tiny compressed payload that inflates to
/// gigabytes is aborted as soon as the accumulated size crosses the
/// cap — it never gets fully buffered in memory.
async fn from_wreq(mut resp: wreq::Response) -> Result<Self, FetchError> {
async fn from_wreq(resp: wreq::Response) -> Result<Self, FetchError> {
if let Some(len) = resp.content_length()
&& len > MAX_BODY_BYTES
{
@ -130,12 +131,13 @@ impl Response {
let url = resp.uri().to_string();
let headers = resp.headers().clone();
// wreq 6.0.0-rc.29 dropped `Response::chunk()`. Stream post-decompression
// bytes via `bytes_stream()` and keep enforcing the running ceiling so a
// compression bomb is aborted before it is fully buffered in memory.
let mut buf = bytes::BytesMut::new();
while let Some(chunk) = resp
.chunk()
.await
.map_err(|e| FetchError::BodyDecode(e.to_string()))?
{
let mut stream = resp.bytes_stream();
while let Some(chunk) = stream.next().await {
let chunk = chunk.map_err(|e| FetchError::BodyDecode(e.to_string()))?;
check_body_ceiling(buf.len(), chunk.len())?;
buf.extend_from_slice(&chunk);
}

View file

@ -11,6 +11,7 @@ pub mod extractors;
pub mod fetcher;
pub mod linkedin;
pub mod locale;
pub mod progress;
pub mod proxy;
pub mod reddit;
pub mod sitemap;
@ -24,6 +25,7 @@ pub use error::FetchError;
pub use fetcher::Fetcher;
pub use http::HeaderMap;
pub use locale::{accept_language_for_tld, accept_language_for_url};
pub use progress::{PROGRESS_INTERVAL, with_progress};
pub use proxy::{parse_proxy_file, parse_proxy_line};
pub use sitemap::SitemapEntry;
pub use webclaw_pdf::PdfMode;

View file

@ -0,0 +1,293 @@
//! Periodic stderr progress line emitter for slow fetches (M13).
//!
//! Wraps any async fetch future with a `tokio::select!` against a
//! `tokio::time::interval`. Every `PROGRESS_INTERVAL` (default 10s) of
//! elapsed time, emits one line to STDERR of the form:
//!
//! ```text
//! # webclaw: still fetching <URL> (Ns)
//! ```
//!
//! Fetches completing in under `PROGRESS_INTERVAL` emit zero lines (the
//! timer never fires). Stdout is untouched.
//!
//! The URL is truncated to at most 80 chars (head + `...` + tail) so
//! pathological query strings don't blow up the stderr line. Truncation
//! is char-boundary safe (operates on `chars`, not bytes).
use std::future::Future;
use std::time::Duration;
use tokio::time::{Instant, MissedTickBehavior, interval};
/// Default progress emission interval, used by [`with_progress`] as the tick
/// period. The first tick fires at +10s elapsed; subsequent ticks at +20s,
/// +30s, etc.
pub const PROGRESS_INTERVAL: Duration = Duration::from_secs(10);
/// Maximum URL length in the progress line, counted in `char`s (not bytes).
/// Longer URLs are truncated `head...tail` style by [`truncate_url`].
const MAX_URL_LEN: usize = 80;
/// Await `future` while printing a progress line to STDERR every
/// [`PROGRESS_INTERVAL`] until it resolves.
///
/// Thin production wrapper over [`with_progress_writer`]: it supplies the
/// default interval and an `eprintln!`-backed sink. The value produced by
/// `future` is handed back untouched.
pub async fn with_progress<F, T>(url: &str, future: F) -> T
where
    F: Future<Output = T>,
{
    let to_stderr = |line: String| eprintln!("{line}");
    with_progress_writer(url, future, PROGRESS_INTERVAL, to_stderr).await
}
/// Drive `future` to completion, reporting progress through `writer`.
///
/// Injectable core behind [`with_progress`]: the caller chooses the tick
/// `period` (tests use tens of milliseconds instead of 10s) and the sink that
/// receives each line (tests collect lines instead of touching real stderr).
/// [`with_progress`] calls this with [`PROGRESS_INTERVAL`] and an `eprintln!`
/// sink.
///
/// Each time the interval ticks before `future` has resolved, one line built
/// by [`format_progress_line`] is passed to `writer`. The output of `future`
/// is returned unchanged.
///
/// # Panics
///
/// `tokio::time::interval` panics when `period` is zero, and so does this
/// function.
pub async fn with_progress_writer<F, T, W>(
    url: &str,
    future: F,
    period: Duration,
    mut writer: W,
) -> T
where
    F: Future<Output = T>,
    W: FnMut(String),
{
    let began = Instant::now();
    let mut heartbeat = interval(period);
    heartbeat.set_missed_tick_behavior(MissedTickBehavior::Skip);
    // The first tick of a tokio interval completes immediately. Swallow it
    // here so nothing is emitted at t=0; the following ticks are scheduled
    // one `period` apart.
    heartbeat.tick().await;

    let mut fetch = std::pin::pin!(future);
    loop {
        tokio::select! {
            // Poll the fetch first: when both branches are ready, returning
            // the result wins over emitting one more progress line.
            biased;
            output = &mut fetch => return output,
            _ = heartbeat.tick() => writer(format_progress_line(url, began.elapsed())),
        }
    }
}
/// Render one progress line: `# webclaw: still fetching <URL> (Ns)`.
///
/// The URL is shortened by [`truncate_url`] to at most [`MAX_URL_LEN`]
/// chars. `N` is the whole number of elapsed seconds; the fractional part is
/// dropped (9.5s is shown as `9s`), not rounded.
pub(crate) fn format_progress_line(url: &str, elapsed: Duration) -> String {
    let (truncated, secs) = (truncate_url(url, MAX_URL_LEN), elapsed.as_secs());
    format!("# webclaw: still fetching {truncated} ({secs}s)")
}
/// Truncate `url` to at most `max` chars, using `head...tail` shape
/// when truncation is needed. Char-boundary safe (operates on `chars`).
///
/// - A URL of `max` chars or fewer is returned unchanged.
/// - Otherwise three chars are reserved for the `...` marker, up to 17 chars
///   are kept from the end of the URL, and whatever budget remains is kept
///   from the start. With `max = 80` that is 60 head chars + `...` + 17 tail
///   chars.
/// - When the budget is smaller than 20 the tail shrinks to fit, and when
///   `max <= 3` (no room for the marker plus any content) the first `max`
///   chars are returned. The result therefore never exceeds `max` chars.
pub(crate) fn truncate_url(url: &str, max: usize) -> String {
    const ELLIPSIS: &str = "...";
    const TAIL_CHARS: usize = 17;

    let total_chars = url.chars().count();
    if total_chars <= max {
        return url.to_string();
    }
    if max <= ELLIPSIS.len() {
        // Too small to show a marker and content; a plain prefix is the
        // only output that respects the limit.
        return url.chars().take(max).collect();
    }

    // Clamp the tail to the available budget so tiny limits are honoured;
    // the head receives whatever is left (possibly nothing).
    let avail = max - ELLIPSIS.len();
    let tail_chars = TAIL_CHARS.min(avail);
    let head_chars = avail - tail_chars;

    // `total_chars > max >= head_chars + tail_chars`, so head and tail never
    // overlap and the subtraction below cannot underflow.
    let head: String = url.chars().take(head_chars).collect();
    let tail: String = url.chars().skip(total_chars - tail_chars).collect();
    format!("{head}{ELLIPSIS}{tail}")
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::{Arc, Mutex};

    /// Build a shared line sink plus a writer closure that appends every
    /// emitted line to it, so tests can inspect output without using stderr.
    fn capture() -> (Arc<Mutex<Vec<String>>>, impl FnMut(String)) {
        let sink: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
        let sink_clone = Arc::clone(&sink);
        let writer = move |s: String| {
            sink_clone.lock().unwrap().push(s);
        };
        (sink, writer)
    }

    #[tokio::test]
    async fn test_progress_emits_after_interval_elapsed() {
        let (sink, writer) = capture();
        // 250ms future, 50ms interval — several ticks are expected before
        // resolution, but only "at least one" is asserted to stay robust
        // against scheduler jitter.
        let fut = tokio::time::sleep(Duration::from_millis(250));
        with_progress_writer(
            "https://example.com/slow",
            async {
                fut.await;
                42_i32
            },
            Duration::from_millis(50),
            writer,
        )
        .await;
        let lines = sink.lock().unwrap();
        assert!(
            !lines.is_empty(),
            "expected >=1 progress line; got {} ({:?})",
            lines.len(),
            *lines
        );
        for line in lines.iter() {
            assert!(
                line.starts_with("# webclaw: still fetching"),
                "line shape wrong: {line:?}"
            );
            assert!(
                line.contains("https://example.com/slow"),
                "url missing from line: {line:?}"
            );
        }
    }

    #[tokio::test]
    async fn test_progress_silent_on_fast_future() {
        let (sink, writer) = capture();
        // 10ms future, 1s interval — zero ticks expected.
        let result = with_progress_writer(
            "https://example.com/fast",
            async {
                tokio::time::sleep(Duration::from_millis(10)).await;
                "done"
            },
            Duration::from_secs(1),
            writer,
        )
        .await;
        assert_eq!(result, "done");
        let lines = sink.lock().unwrap();
        assert_eq!(
            lines.len(),
            0,
            "expected 0 progress lines on fast future; got {:?}",
            *lines
        );
    }

    #[tokio::test]
    async fn test_progress_line_includes_url() {
        let (sink, writer) = capture();
        let target_url = "https://news.ycombinator.com/item?id=12345";
        with_progress_writer(
            target_url,
            async {
                tokio::time::sleep(Duration::from_millis(150)).await;
            },
            Duration::from_millis(50),
            writer,
        )
        .await;
        let lines = sink.lock().unwrap();
        assert!(!lines.is_empty(), "expected progress lines");
        assert!(
            lines.iter().all(|l| l.contains(target_url)),
            "every line should contain the URL: {:?}",
            *lines
        );
    }

    #[tokio::test]
    async fn test_progress_returns_inner_result_ok() {
        let (_sink, writer) = capture();
        let r: Result<i32, String> = with_progress_writer(
            "https://example.com/",
            async { Ok::<i32, String>(7) },
            Duration::from_secs(1),
            writer,
        )
        .await;
        assert_eq!(r, Ok(7));
    }

    #[tokio::test]
    async fn test_progress_propagates_error() {
        let (_sink, writer) = capture();
        let r: Result<i32, String> = with_progress_writer(
            "https://example.com/",
            async { Err::<i32, String>("boom".to_string()) },
            Duration::from_secs(1),
            writer,
        )
        .await;
        assert_eq!(r, Err("boom".to_string()));
    }

    #[test]
    fn test_truncate_url_short_passthrough() {
        let url = "https://example.com/";
        assert_eq!(truncate_url(url, 80), url);
    }

    #[test]
    fn test_truncate_url_long_head_dots_tail() {
        let url = "https://www.example.com/very/long/path/segments/with/lots/of/text/and/then?q=some_long_query_string_value_here&other=more&another=thing";
        let truncated = truncate_url(url, 80);
        assert!(
            truncated.chars().count() <= 80,
            "truncated length {} > 80: {truncated:?}",
            truncated.chars().count()
        );
        assert!(
            truncated.contains("..."),
            "expected '...' marker in truncated url: {truncated:?}"
        );
        assert!(
            truncated.starts_with("https://www.example.com/"),
            "truncated should start with the URL head: {truncated:?}"
        );
    }

    #[test]
    fn test_truncate_url_unicode_safe() {
        // Cyrillic URL longer than 80 chars — must not panic on a
        // mid-codepoint split, and the limit must be counted in chars
        // rather than bytes.
        let url =
            "https://example.com/путь/к/очень/длинной/странице/с/большим/количеством/кириллицы/тут";
        assert!(
            url.chars().count() > 80,
            "fixture must be long enough to force truncation"
        );
        let truncated = truncate_url(url, 80);
        assert!(
            truncated.chars().count() <= 80,
            "truncated length {} > 80 chars: {truncated:?}",
            truncated.chars().count()
        );
        assert!(
            truncated.contains("..."),
            "expected '...' marker in truncated url: {truncated:?}"
        );
        // Head and tail must survive intact, multi-byte chars included.
        assert!(
            truncated.starts_with("https://example.com/"),
            "truncated should start with the URL head: {truncated:?}"
        );
        assert!(
            truncated.ends_with("тут"),
            "truncated should end with the URL tail: {truncated:?}"
        );
    }

    #[test]
    fn test_format_progress_line_shape() {
        let line = format_progress_line("https://example.com/", Duration::from_secs(10));
        assert_eq!(line, "# webclaw: still fetching https://example.com/ (10s)");
    }

    #[test]
    fn test_format_progress_line_seconds_only() {
        // Elapsed time is truncated to whole seconds — 9.5s prints as `9s`,
        // never as a fraction and never rounded up. (In practice the first
        // tick fires at +PROGRESS_INTERVAL so this is mostly a defensive
        // shape assertion.)
        let line = format_progress_line("https://x/", Duration::from_millis(9_500));
        assert!(
            line.ends_with("(9s)"),
            "line should end with `(9s)`: {line:?}"
        );
    }
}

View file

@ -10,15 +10,24 @@ use std::{borrow::Cow, io, time::Duration};
use wreq::http2::{
Http2Options, PseudoId, PseudoOrder, SettingId, SettingsOrder, StreamDependency, StreamId,
};
use wreq::tls::{
AlpnProtocol, AlpsProtocol, CertificateCompressionAlgorithm, ExtensionType, TlsOptions,
TlsVersion,
};
use wreq::{Client, Emulation};
use wreq::tls::compress::CertificateCompressor;
use wreq::tls::{AlpnProtocol, AlpsProtocol, ExtensionType, TlsOptions, TlsVersion};
use wreq::{Client, Emulation, Group, IntoEmulation};
use wreq_util::emulate::compress::{BrotliCompressor, ZlibCompressor};
use crate::browser::BrowserVariant;
use crate::error::FetchError;
// Per-profile certificate-compression advertisement. Since wreq 6.0.0-rc.29
// the TLS builder takes `&dyn CertificateCompressor` trait objects instead of
// `CertificateCompressionAlgorithm` enum values; the concrete zlib/brotli
// implementations come from wreq-util. Which compressors are advertised, and
// in what order, is part of the TLS fingerprint, so each list reproduces the
// enum list it replaced. (References inside a `static` are `'static` by
// default, so the lifetime is left implicit.)
static CHROME_CERT_COMPRESSORS: &[&dyn CertificateCompressor] = &[&BrotliCompressor];
static FIREFOX_CERT_COMPRESSORS: &[&dyn CertificateCompressor] =
    &[&ZlibCompressor, &BrotliCompressor];
static SAFARI_CERT_COMPRESSORS: &[&dyn CertificateCompressor] = &[&ZlibCompressor];
#[derive(Clone, Default)]
struct PublicDnsResolver;
@ -119,14 +128,14 @@ fn chrome_extensions() -> Vec<ExtensionType> {
ExtensionType::PSK_KEY_EXCHANGE_MODES, // 45
ExtensionType::EC_POINT_FORMATS, // 11
ExtensionType::CERT_COMPRESSION, // 27
ExtensionType::APPLICATION_SETTINGS_NEW, // 17613 (new codepoint, matches alps_use_new_codepoint)
ExtensionType::SUPPORTED_VERSIONS, // 43
ExtensionType::SIGNATURE_ALGORITHMS, // 13
ExtensionType::SERVER_NAME, // 0
ExtensionType::APPLICATION_SETTINGS, // 17613 (new codepoint, matches alps_use_new_codepoint)
ExtensionType::SUPPORTED_VERSIONS, // 43
ExtensionType::SIGNATURE_ALGORITHMS, // 13
ExtensionType::SERVER_NAME, // 0
ExtensionType::APPLICATION_LAYER_PROTOCOL_NEGOTIATION, // 16
ExtensionType::ENCRYPTED_CLIENT_HELLO, // 65037
ExtensionType::RENEGOTIATE, // 65281
ExtensionType::EXTENDED_MASTER_SECRET, // 23
ExtensionType::ENCRYPTED_CLIENT_HELLO, // 65037
ExtensionType::RENEGOTIATE, // 65281
ExtensionType::EXTENDED_MASTER_SECRET, // 23
]
}
@ -287,7 +296,7 @@ fn chrome_tls() -> TlsOptions {
.alps_protocols([AlpsProtocol::HTTP3, AlpsProtocol::HTTP2])
.alps_use_new_codepoint(true)
.aes_hw_override(true)
.certificate_compression_algorithms(&[CertificateCompressionAlgorithm::BROTLI])
.certificate_compressors(CHROME_CERT_COMPRESSORS)
.build()
}
@ -304,10 +313,7 @@ fn firefox_tls() -> TlsOptions {
.pre_shared_key(true)
.enable_ocsp_stapling(true)
.enable_signed_cert_timestamps(true)
.certificate_compression_algorithms(&[
CertificateCompressionAlgorithm::ZLIB,
CertificateCompressionAlgorithm::BROTLI,
])
.certificate_compressors(FIREFOX_CERT_COMPRESSORS)
.build()
}
@ -324,7 +330,7 @@ fn safari_tls() -> TlsOptions {
.pre_shared_key(false)
.enable_ocsp_stapling(true)
.enable_signed_cert_timestamps(true)
.certificate_compression_algorithms(&[CertificateCompressionAlgorithm::ZLIB])
.certificate_compressors(SAFARI_CERT_COMPRESSORS)
.build()
}
@ -345,21 +351,23 @@ fn safari_tls() -> TlsOptions {
/// `priority: u=0, i`, zstd), replace with the real iOS 26 set.
/// 4. `accept-language` preserved from config.extra_headers for locale.
fn safari_ios_emulation() -> wreq::Emulation {
use wreq::EmulationFactory;
let mut em = wreq_util::Emulation::SafariIos26.emulation();
// wreq 6.0.0-rc.29 exposes the `Emulation` fields directly (no `*_mut()`
// accessors) and wreq-util 3.0.0-rc.12 renamed the enum to `Profile` with
// `IntoEmulation::into_emulation` replacing `EmulationFactory::emulation`.
let mut em = wreq_util::Profile::SafariIos26.into_emulation();
if let Some(tls) = em.tls_options_mut().as_mut() {
if let Some(tls) = em.tls_options.as_mut() {
tls.extension_permutation = Some(Cow::Owned(safari_ios_extensions()));
}
// Only override the priority flag — keep wreq-util's SETTINGS, WINDOW_UPDATE,
// and pseudo-order intact. Replacing the whole Http2Options resets SETTINGS
// to defaults, which sends only INITIAL_WINDOW_SIZE and fails DataDome.
if let Some(h2) = em.http2_options_mut().as_mut() {
if let Some(h2) = em.http2_options.as_mut() {
h2.headers_stream_dependency = Some(StreamDependency::new(StreamId::zero(), 255, true));
}
let hm = em.headers_mut();
let hm = &mut em.headers;
hm.clear();
for (k, v) in SAFARI_IOS_HEADERS {
if let (Ok(n), Ok(val)) = (
@ -508,12 +516,12 @@ pub fn build_client(
.tls_options(tls)
.http2_options(h2)
.headers(build_headers(headers))
.build()
.build(Group::default())
}
};
// Append extra headers after profile defaults.
let hm = emulation.headers_mut();
let hm = &mut emulation.headers;
for (k, v) in extra_headers {
if let (Ok(n), Ok(val)) = (
http::header::HeaderName::from_bytes(k.as_bytes()),