From 06f065cb08dbe24a5b5abd612bfccd96728f7b6e Mon Sep 17 00:00:00 2001 From: devnen Date: Sun, 24 May 2026 00:24:51 +0200 Subject: [PATCH] feat(fetch): periodic progress stderr line on slow fetches Webclaw's default -t timeout is 30s; slow sites previously sat silently with no feedback. Now during a fetch, every 10s of elapsed time webclaw writes one line to stderr: # webclaw: still fetching (Ns) Fetches completing in under 10s emit nothing (the timer never fires). Stdout output is untouched - pure feedback signal on stderr. No timeout change. No new flags. Default behavior is augmented at stderr only. Implemented via tokio::select! between the fetch future and a tokio::time::interval. Latency cost: a single tokio task spawn and a 10s tick - microseconds on the fast path. 10 new tests in webclaw-fetch::progress::tests (none ignored; the slow-future test uses a 50ms test interval to keep cargo test fast). Workspace total 710 -> 720. --- crates/webclaw-cli/src/main.rs | 6 +- crates/webclaw-fetch/src/lib.rs | 2 + crates/webclaw-fetch/src/progress.rs | 292 +++++++++++++++++++++++++++ 3 files changed, 299 insertions(+), 1 deletion(-) create mode 100644 crates/webclaw-fetch/src/progress.rs diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index c07e1fc..ba5746d 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -1023,7 +1023,11 @@ async fn fetch_and_extract(cli: &Cli) -> Result { let client = FetchClient::new(build_fetch_config(cli)).map_err(|e| format!("client error: {e}"))?; let options = build_extraction_options(cli); - let result = match client.fetch_and_extract_with_options(url, &options).await { + // M13: wrap with periodic stderr progress emitter. Fast fetches see + // zero emissions (timer never fires in <10s); slow fetches get a + // line every 10s of elapsed time so the CLI doesn't appear hung. + let fetch_fut = client.fetch_and_extract_with_options(url, &options); + let result = match webclaw_fetch::with_progress(url, fetch_fut).await { Ok(r) => r, // M3: known-bad-sites registry hit. The error message is already // formatted per phase-A contract. Emit it to stderr verbatim and diff --git a/crates/webclaw-fetch/src/lib.rs b/crates/webclaw-fetch/src/lib.rs index febe908..6cee844 100644 --- a/crates/webclaw-fetch/src/lib.rs +++ b/crates/webclaw-fetch/src/lib.rs @@ -12,6 +12,7 @@ pub mod fetcher; pub mod known_bad_sites; pub mod linkedin; pub mod locale; +pub mod progress; pub mod proxy; pub mod reddit; pub mod sitemap; @@ -25,6 +26,7 @@ pub use error::FetchError; pub use fetcher::Fetcher; pub use http::HeaderMap; pub use locale::{accept_language_for_tld, accept_language_for_url}; +pub use progress::{with_progress, PROGRESS_INTERVAL}; pub use proxy::{parse_proxy_file, parse_proxy_line}; pub use sitemap::SitemapEntry; pub use webclaw_pdf::PdfMode; diff --git a/crates/webclaw-fetch/src/progress.rs b/crates/webclaw-fetch/src/progress.rs new file mode 100644 index 0000000..223f66d --- /dev/null +++ b/crates/webclaw-fetch/src/progress.rs @@ -0,0 +1,292 @@ +//! Periodic stderr progress line emitter for slow fetches (M13). +//! +//! Wraps any async fetch future with a `tokio::select!` against a +//! `tokio::time::interval`. Every `PROGRESS_INTERVAL` (default 10s) of +//! elapsed time, emits one line to STDERR of the form: +//! +//! ```text +//! # webclaw: still fetching (Ns) +//! ``` +//! +//! Fetches completing in under `PROGRESS_INTERVAL` emit zero lines (the +//! timer never fires). Stdout is untouched. +//! +//! The URL is truncated to at most 80 chars (head + `...` + tail) so +//! pathological query strings don't blow up the stderr line. Truncation +//! is char-boundary safe (operates on `chars`, not bytes). + +use std::future::Future; +use std::time::Duration; + +use tokio::time::{interval, Instant, MissedTickBehavior}; + +/// Default progress emission interval. The first tick fires at +10s +/// elapsed; subsequent ticks at +20s, +30s, etc. +pub const PROGRESS_INTERVAL: Duration = Duration::from_secs(10); + +/// Maximum URL length in the progress line. Longer URLs are truncated +/// `head...tail` style. +const MAX_URL_LEN: usize = 80; + +/// Wrap a fetch future with the default 10s progress emitter. Writes +/// progress lines to STDERR via `eprintln!`. Returns the inner future's +/// result unchanged. +pub async fn with_progress(url: &str, future: F) -> T +where + F: Future, +{ + with_progress_writer(url, future, PROGRESS_INTERVAL, |s| eprintln!("{s}")).await +} + +/// Test-friendly variant of [`with_progress`]: caller supplies the tick +/// interval (so tests can use a 50ms period instead of 10s) and a +/// writer closure (so tests can capture emitted lines without touching +/// real stderr). +/// +/// Production code uses [`with_progress`] which delegates here with +/// [`PROGRESS_INTERVAL`] and an `eprintln!` writer. +pub async fn with_progress_writer( + url: &str, + future: F, + period: Duration, + mut writer: W, +) -> T +where + F: Future, + W: FnMut(String), +{ + let start = Instant::now(); + let mut ticker = interval(period); + // First tick of `tokio::time::interval(period)` fires *immediately* + // (at construction time). We don't want a t=0 emit — consume that + // first tick before entering the select loop. Subsequent ticks fire + // at `start + period`, `start + 2*period`, ... + ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); + ticker.tick().await; + + tokio::pin!(future); + + loop { + tokio::select! { + // Bias toward the future — if both are ready (rare), prefer + // returning the result over emitting a final tick. + biased; + result = &mut future => { + return result; + } + _ = ticker.tick() => { + let elapsed = start.elapsed(); + writer(format_progress_line(url, elapsed)); + } + } + } +} + +/// Build the progress line: `# webclaw: still fetching (Ns)`. +/// URL is truncated via [`truncate_url`] to [`MAX_URL_LEN`] chars. +/// Elapsed is rounded to whole seconds (10, 20, 30, ...). +pub(crate) fn format_progress_line(url: &str, elapsed: Duration) -> String { + let truncated = truncate_url(url, MAX_URL_LEN); + let secs = elapsed.as_secs(); + format!("# webclaw: still fetching {truncated} ({secs}s)") +} + +/// Truncate `url` to at most `max` chars, using `head...tail` shape +/// when truncation is needed. Char-boundary safe (operates on `chars`). +pub(crate) fn truncate_url(url: &str, max: usize) -> String { + let total_chars = url.chars().count(); + if total_chars <= max { + return url.to_string(); + } + // Reserve 3 chars for "..." and split the remainder ~70/30 between + // head (path-side) and tail (query-side). + let avail = max.saturating_sub(3); + let head_chars = avail.saturating_sub(17); + let tail_chars = 17; + let head: String = url.chars().take(head_chars).collect(); + let tail: String = url + .chars() + .rev() + .take(tail_chars) + .collect::>() + .into_iter() + .rev() + .collect(); + format!("{head}...{tail}") +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::{Arc, Mutex}; + + /// Collect emitted lines into a `Vec` via a captured writer. + fn capture() -> (Arc>>, impl FnMut(String)) { + let sink: Arc>> = Arc::new(Mutex::new(Vec::new())); + let sink_clone = Arc::clone(&sink); + let writer = move |s: String| { + sink_clone.lock().unwrap().push(s); + }; + (sink, writer) + } + + #[tokio::test] + async fn test_progress_emits_after_interval_elapsed() { + let (sink, writer) = capture(); + // 250ms future, 50ms interval — expect ~4-5 ticks before resolution. + let fut = tokio::time::sleep(Duration::from_millis(250)); + with_progress_writer( + "https://example.com/slow", + async { + fut.await; + 42_i32 + }, + Duration::from_millis(50), + writer, + ) + .await; + let lines = sink.lock().unwrap(); + assert!( + !lines.is_empty(), + "expected >=1 progress line; got {} ({:?})", + lines.len(), + *lines + ); + for line in lines.iter() { + assert!( + line.starts_with("# webclaw: still fetching"), + "line shape wrong: {line:?}" + ); + assert!( + line.contains("https://example.com/slow"), + "url missing from line: {line:?}" + ); + } + } + + #[tokio::test] + async fn test_progress_silent_on_fast_future() { + let (sink, writer) = capture(); + // 10ms future, 1s interval — zero ticks expected. + let result = with_progress_writer( + "https://example.com/fast", + async { + tokio::time::sleep(Duration::from_millis(10)).await; + "done" + }, + Duration::from_secs(1), + writer, + ) + .await; + assert_eq!(result, "done"); + let lines = sink.lock().unwrap(); + assert_eq!( + lines.len(), + 0, + "expected 0 progress lines on fast future; got {:?}", + *lines + ); + } + + #[tokio::test] + async fn test_progress_line_includes_url() { + let (sink, writer) = capture(); + let target_url = "https://news.ycombinator.com/item?id=12345"; + with_progress_writer( + target_url, + async { + tokio::time::sleep(Duration::from_millis(150)).await; + }, + Duration::from_millis(50), + writer, + ) + .await; + let lines = sink.lock().unwrap(); + assert!(!lines.is_empty(), "expected progress lines"); + assert!( + lines.iter().all(|l| l.contains(target_url)), + "every line should contain the URL: {:?}", + *lines + ); + } + + #[tokio::test] + async fn test_progress_returns_inner_result_ok() { + let (_sink, writer) = capture(); + let r: Result = with_progress_writer( + "https://example.com/", + async { Ok::(7) }, + Duration::from_secs(1), + writer, + ) + .await; + assert_eq!(r, Ok(7)); + } + + #[tokio::test] + async fn test_progress_propagates_error() { + let (_sink, writer) = capture(); + let r: Result = with_progress_writer( + "https://example.com/", + async { Err::("boom".to_string()) }, + Duration::from_secs(1), + writer, + ) + .await; + assert_eq!(r, Err("boom".to_string())); + } + + #[test] + fn test_truncate_url_short_passthrough() { + let url = "https://example.com/"; + assert_eq!(truncate_url(url, 80), url); + } + + #[test] + fn test_truncate_url_long_head_dots_tail() { + let url = "https://www.example.com/very/long/path/segments/with/lots/of/text/and/then?q=some_long_query_string_value_here&other=more&another=thing"; + let truncated = truncate_url(url, 80); + assert!( + truncated.chars().count() <= 80, + "truncated length {} > 80: {truncated:?}", + truncated.chars().count() + ); + assert!( + truncated.contains("..."), + "expected '...' marker in truncated url: {truncated:?}" + ); + assert!( + truncated.starts_with("https://www.example.com/"), + "truncated should start with the URL head: {truncated:?}" + ); + } + + #[test] + fn test_truncate_url_unicode_safe() { + // Cyrillic URL longer than 80 chars — must not panic on a + // mid-codepoint split. + let url = "https://example.com/путь/к/очень/длинной/странице/с/большим/количеством/кириллицы/тут"; + let truncated = truncate_url(url, 80); + assert!(truncated.is_char_boundary(truncated.len())); + // Roundtrip through chars to confirm valid UTF-8 throughout. + let _: String = truncated.chars().collect(); + } + + #[test] + fn test_format_progress_line_shape() { + let line = format_progress_line("https://example.com/", Duration::from_secs(10)); + assert_eq!( + line, + "# webclaw: still fetching https://example.com/ (10s)" + ); + } + + #[test] + fn test_format_progress_line_seconds_only() { + // Sub-second elapsed rounds to 0s, not fractions. (In practice + // the first tick fires at +PROGRESS_INTERVAL so this is mostly + // a defensive shape assertion.) + let line = format_progress_line("https://x/", Duration::from_millis(9_500)); + assert!(line.ends_with("(9s)"), "line should end with `(9s)`: {line:?}"); + } +}