feat(core): JS-hub page detector + --prefer-articles flag

Detects ESPN-style hub pages (espn.com/nba/, /nfl/, /mlb/, /nhl/, /soccer/) where the rendered markup has nav-only content with no article bodies — chrome retry doesn't help because the data genuinely isn't in the markup. Heuristic: word_count < 500 AND link_count >= 5 against the extracted output. --prefer-articles: when set, a hub-classified page returns the extracted link list (reusing the M1 --mode summary machinery) instead of the sparse body. On non-hub pages, behavior is unchanged. stderr hint: always emitted on hub detection so the caller knows to drill /story/_/id/<id>/ URLs from a citation list. False-positive resistance verified: BBC News /world (link-heavy aggregator, 1500+ words body) and n1info.rs (widget-heavy but content-rich) both classify as non-hub and emit full extraction. 9 new tests in webclaw-core (317 -> 326).
2026-06-11 22:55:13 +02:00 · 2026-05-23 18:55:17 +02:00 · 2026-05-23 18:55:17 +02:00 · 31a8f6150f
commit 31a8f6150f
parent 339f41bb7c
4 changed files with 383 additions and 4 deletions
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -180,6 +180,14 @@ struct Cli {
    #[arg(long, default_value = "0")]
    max_output_bytes: u64,

+    /// When the page is detected as a JS hub (short body + nav-style link list,
+    /// e.g. ESPN /nba /nfl /mlb /nhl /soccer), return only the extracted link
+    /// list (equivalent to --mode summary). Non-hub pages are unchanged.
+    /// A one-line stderr hint is also emitted on hub detection regardless of
+    /// this flag, so callers can react on the next invocation.
+    #[arg(long)]
+    prefer_articles: bool,
+
    /// Browser to impersonate
    #[arg(short, long, default_value = "chrome")]
    browser: Browser,
@ -1122,6 +1130,37 @@ fn print_output_with_mode(
    println!("{out}");
 }

+/// Run the hub-page classifier on `result` and choose the mode to emit with.
+///
+/// When `webclaw_core::classify_hub` reports a hub:
+///   - one `# hint: …` line is always written to stderr (informational only),
+///   - if `prefer_articles` is set, `OutputMode::Summary` is returned
+///     so the caller gets the link list directly without re-invoking.
+///
+/// When no hub is reported, a clone of `requested_mode` is returned and
+/// nothing is written to stderr. There is no local/cloud check in here:
+/// callers are expected to pass only locally extracted results, as `main`
+/// does for `FetchOutput::Local`. With `prefer_articles == false` the return
+/// value is always a clone of `requested_mode`; only the stderr hint is new.
+fn apply_hub_detection(
+    result: &ExtractionResult,
+    requested_mode: &OutputMode,
+    prefer_articles: bool,
+) -> OutputMode {
+    let classification = webclaw_core::classify_hub(result);
+    if !classification.is_hub {
+        return requested_mode.clone();
+    }
+    // Hub detected: the hint is written to stderr, never stdout, regardless of the flag.
+    eprintln!("# hint: {}", classification.hint_line());
+    if prefer_articles {
+        // Opt-in override: ignore the requested mode and emit the summary instead.
+        OutputMode::Summary
+    } else {
+        requested_mode.clone()
+    }
+}
+
 /// Print cloud API response in the requested format.
 fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
    match format {
@ -2754,6 +2793,7 @@ async fn main() {
    // Single-page extraction (handles both HTML and PDF via content-type detection)
    match fetch_and_extract(&cli).await {
        Ok(FetchOutput::Local(result)) => {
+            let effective_mode = apply_hub_detection(&result, &cli.mode, cli.prefer_articles);
            if let Some(ref dir) = cli.output_dir {
                let url = cli
                    .urls
@ -2766,7 +2806,7 @@ async fn main() {
                    &result,
                    &cli.format,
                    cli.metadata,
-                    &cli.mode,
+                    &effective_mode,
                    cli.max_output_bytes,
                );
                if let Err(e) = write_to_file(dir, &filename, &content) {
@ -2778,7 +2818,7 @@ async fn main() {
                    &result,
                    &cli.format,
                    cli.metadata,
-                    &cli.mode,
+                    &effective_mode,
                    cli.max_output_bytes,
                );
            }