From 76cd515a3e309ebf7abe23cb61523b25ba6cdb83 Mon Sep 17 00:00:00 2001
From: devnen <nenadoric@gmail.com>
Date: Sat, 23 May 2026 22:18:12 +0200
Subject: [PATCH] feat(core): thin-body classifier + stderr hint for JS-walled
 content-heavy sites

On sites like Hollywood Reporter where the extracted body is < 500 words
because the page is JS-walled (the content only appears after client-side
rendering, which webclaw cannot do), webclaw now emits a one-line stderr
hint:

  # hint: extracted body is N words (<500); page may be JS-rendered or
    paywalled. Try a subsection URL (e.g. /<topic>/) for content-heavy
    pages, or see M11 (--paywall-bypass, pending) for paywalled sites.

Thin-body classification (crates/webclaw-core/src/llm/thin_body.rs)
mirrors the M2 hub-detector structure. Threshold: 500 words. Exemption
list for utility domains (example.com, httpbin.org, etc) where thinness
is by design.

The originally proposed --retry-thin flag was dropped after phase A
determined webclaw has no headless-JS backend to retry to (--browser
only changes the impersonated browser fingerprint; it does not render
JS). The hint-only design lets the caller decide: try a subsection URL
as the hint suggests, or switch to a different fetcher entirely.

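Hint is emitted only when the requested mode is full (OutputMode::Full):
--mode summary / --mode toc (link/outline focused) are skipped, as are
pages where the hub detector already printed its own hint. M3 fast-fails
skip the formatter entirely so no hint.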

Stdout invariance: tested byte-identical on all p01-p15 default probes.
M10 only modifies stderr.

10 new tests (workspace 678 -> 688).
---
 crates/webclaw-cli/src/main.rs           |  35 +++-
 crates/webclaw-core/src/lib.rs           |   4 +-
 crates/webclaw-core/src/llm/mod.rs       |   2 +
 crates/webclaw-core/src/llm/thin_body.rs | 245 +++++++++++++++++++++++
 4 files changed, 280 insertions(+), 6 deletions(-)
 create mode 100644 crates/webclaw-core/src/llm/thin_body.rs

diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs
index bc20249..eabcef3 100644
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@@ -1260,18 +1260,43 @@ fn apply_hub_detection(
     result: &ExtractionResult,
     requested_mode: &OutputMode,
     prefer_articles: bool,
-) -> OutputMode {
+) -> (OutputMode, bool) {
     let classification = webclaw_core::classify_hub(result);
     if !classification.is_hub {
-        return requested_mode.clone();
+        return (requested_mode.clone(), false);
     }
     // Always emit the informational hint on hub detection — stderr only.
     eprintln!("# hint: {}", classification.hint_line());
-    if prefer_articles {
+    let mode = if prefer_articles {
         // Caller asked us to honor the detection: switch to summary.
         OutputMode::Summary
     } else {
         requested_mode.clone()
+    };
+    (mode, true)
+}
+
+/// M10: emit a one-line stderr hint when `classify_thin_body` reports a
+/// thin body (< 500 words, non-exempt host). Fires only when the requested
+/// mode is `OutputMode::Full`; every other mode (summary, toc, ...) is
+/// skipped, and so is any page where the hub detector already printed its
+/// own hint (avoids double-hinting). Only stderr is written, never stdout.
+fn apply_thin_body_detection(
+    result: &ExtractionResult,
+    requested_mode: &OutputMode,
+    hub_hint_already_emitted: bool,
+) {
+    // Hint only in full mode: every non-`Full` mode (summary, toc, ...) returns early.
+    if !matches!(requested_mode, OutputMode::Full) {
+        return;
+    }
+    // Avoid double-hinting if the hub detector already spoke up.
+    if hub_hint_already_emitted {
+        return;
+    }
+    let classification = webclaw_core::classify_thin_body(result);
+    if let Some(hint) = classification.hint_line() {
+        eprintln!("# hint: {hint}");
     }
 }
 
@@ -2922,7 +2947,9 @@ async fn main() {
                 // Fall through.
             }
 
-            let effective_mode = apply_hub_detection(&result, &cli.mode, cli.prefer_articles);
+            let (effective_mode, hub_hint_emitted) =
+                apply_hub_detection(&result, &cli.mode, cli.prefer_articles);
+            apply_thin_body_detection(&result, &cli.mode, hub_hint_emitted);
             if let Some(ref dir) = cli.output_dir {
                 let url = cli
                     .urls
diff --git a/crates/webclaw-core/src/lib.rs b/crates/webclaw-core/src/lib.rs
index ff5d71b..b32eab3 100644
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@@ -31,9 +31,9 @@ pub use jsonld::{
     ArticleRef, JsonLdSchema, LiveUpdate,
 };
 pub use llm::{
-    classify_hub, to_json_summary, to_json_toc, to_llm_summary, to_llm_text,
+    classify_hub, classify_thin_body, to_json_summary, to_json_toc, to_llm_summary, to_llm_text,
     to_llm_text_with_options, to_llm_toc, truncate_json_with_wrapper, truncate_with_footer,
-    HubClassification, LlmTextOptions,
+    HubClassification, LlmTextOptions, ThinBodyClassification,
 };
 pub use types::{
     CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
diff --git a/crates/webclaw-core/src/llm/mod.rs b/crates/webclaw-core/src/llm/mod.rs
index f92ccd1..21c7b8c 100644
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@@ -11,8 +11,10 @@ mod images;
 mod links;
 mod metadata;
 mod output_size;
+mod thin_body;
 
 pub use hub_detect::{classify as classify_hub, HubClassification};
+pub use thin_body::{classify as classify_thin_body, ThinBodyClassification};
 pub use output_size::{
     to_json_summary, to_json_toc, to_llm_summary, to_llm_toc, truncate_json_with_wrapper,
     truncate_with_footer,
diff --git a/crates/webclaw-core/src/llm/thin_body.rs b/crates/webclaw-core/src/llm/thin_body.rs
new file mode 100644
index 0000000..45785d6
--- /dev/null
+++ b/crates/webclaw-core/src/llm/thin_body.rs
@@ -0,0 +1,245 @@
+//! Thin-body detector (M10, issue #3).
+//!
+//! Some sites — most notably Penske publications (Variety, Hollywood
+//! Reporter, Deadline) — serve a root HTML document whose article content
+//! is not present in the initial DOM. The page is hydrated client-side
+//! by JS that webclaw cannot execute (webclaw's `--browser chrome` is a
+//! wreq TLS-fingerprint impersonation, NOT a headless JS engine). The
+//! served HTML contains only ~200 words of chrome / navigation, so a
+//! caller pointing webclaw at e.g. `https://www.hollywoodreporter.com/`
+//! receives a thin body with no signal that JS rendering would have
+//! produced the actual content.
+//!
+//! This module classifies an `ExtractionResult` as "thin body" / "not
+//! thin" / "exempt" so callers can emit a stderr hint nudging the user
+//! toward a workaround (subsection URL, e.g. `/c/movies/movie-news/`,
+//! or M11's pending `--paywall-bypass`).
+//!
+//! Decision rule (iter-6 phase A measured baseline, see
+//! `iter-06-…-phase-A-report.md`):
+//!
+//!   word_count < `WORD_COUNT_THRESHOLD`  AND host not in EXEMPT_HOSTS
+//!       -> Thin { word_count }
+//!   word_count < `WORD_COUNT_THRESHOLD`  AND host in EXEMPT_HOSTS
+//!       -> Exempt
+//!   word_count >= `WORD_COUNT_THRESHOLD`
+//!       -> NotThin
+//!
+//! Calibration against the iter-6 phase A corpus:
+//!   - HR root (228 words, www.hollywoodreporter.com) -> Thin
+//!   - HR /c/movies/movie-news/ (979 words) -> NotThin
+//!   - BBC article (866 words, www.bbc.com) -> NotThin
+//!   - example.com (20 words) -> Exempt
+//!   - httpbin.org (synthetic, ~5 words) -> Exempt
+//!
+//! Threshold and exemption choice rationale: see phase A report
+//! section "M10 threshold + exemption logic". This module implements
+//! **Option E1** (small hard-coded exempt list for utility/test domains).
+use crate::types::ExtractionResult;
+
+/// Strict upper bound: a page with fewer extracted body words than this
+/// is treated as thin (exactly 500 is not). Iter-6 phase A picked 500 to
+/// match the hub-detector's `WORD_COUNT_THRESHOLD`; the hub hint takes
+/// precedence (see CLI `apply_thin_body_detection`). NOTE(review): this is
+/// a separate constant — keep it in sync with the hub detector by hand.
+pub const WORD_COUNT_THRESHOLD: usize = 500;
+
+/// Domains where a thin body is by-design (test fixtures, utility
+/// endpoints). Hint is suppressed on these so CI/probe runs against
+/// `example.com` / `httpbin.org` don't grow noisy stderr.
+///
+/// Exact match against the URL host, lowercased, one leading `www.` stripped;
+/// other subdomains do not match. Hard-coded per phase A (Option E1).
+const EXEMPT_HOSTS: &[&str] = &[
+    "example.com",
+    "example.net",
+    "example.org",
+    "httpbin.org",
+    "localhost",
+    "127.0.0.1",
+];
+
+/// Outcome of [`classify`]. Only `Thin` carries data: the measured word
+/// count, so the hint can quote it back to the caller.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ThinBodyClassification {
+    /// Body has >= `WORD_COUNT_THRESHOLD` words (host not consulted). No hint.
+    NotThin,
+    /// Below threshold; host not exempt, or URL missing/unparseable. Emit hint.
+    Thin { word_count: usize },
+    /// Body has < `WORD_COUNT_THRESHOLD` words but host is in the
+    /// exempt list (utility / test domain). Hint suppressed.
+    Exempt,
+}
+
+impl ThinBodyClassification {
+    /// Build the stderr hint text for a `Thin` page: one line quoting the
+    /// measured word count and `WORD_COUNT_THRESHOLD`. The leading
+    /// "# hint:" and trailing newline are left to the caller.
+    ///
+    /// Returns `Some` only for `Thin`; `NotThin` and `Exempt` yield `None`,
+    /// so callers can print whatever comes back without checking the variant.
+    pub fn hint_line(&self) -> Option<String> {
+        match self {
+            ThinBodyClassification::Thin { word_count } => Some(format!(
+                "extracted body is {word_count} words (<{threshold}); page may be JS-rendered or paywalled. \
+                 Try a subsection URL (e.g. /<topic>/) for content-heavy pages, \
+                 or see M11 (--paywall-bypass, pending) for paywalled sites.",
+                threshold = WORD_COUNT_THRESHOLD,
+            )),
+            ThinBodyClassification::NotThin | ThinBodyClassification::Exempt => None,
+        }
+    }
+}
+
+/// Classify an extraction result as thin / not-thin / exempt.
+///
+/// Reads `result.metadata.word_count`; at or above `WORD_COUNT_THRESHOLD`
+/// the result is `NotThin` and the URL is never inspected. Below it, the
+/// host of `result.metadata.url` is checked against `EXEMPT_HOSTS`.
+///
+/// A missing or unparseable URL cannot be exempt, so such results are
+/// reported as `Thin`. Performs no I/O.
+pub fn classify(result: &ExtractionResult) -> ThinBodyClassification {
+    let word_count = result.metadata.word_count;
+    if word_count >= WORD_COUNT_THRESHOLD {
+        return ThinBodyClassification::NotThin;
+    }
+    // Below threshold: exempt only when a URL is present and its host is listed.
+    if let Some(url_str) = result.metadata.url.as_deref() {
+        if host_is_exempt(url_str) {
+            return ThinBodyClassification::Exempt;
+        }
+    }
+    ThinBodyClassification::Thin { word_count }
+}
+
+/// Return true when the URL's host (lower-cased, one leading `www.` stripped)
+/// exactly equals an `EXEMPT_HOSTS` entry; the port and path are ignored.
+/// Parse errors and host-less URLs return `false` — prefer hinting over hiding.
+fn host_is_exempt(url_str: &str) -> bool {
+    let parsed = match url::Url::parse(url_str) {
+        Ok(u) => u,
+        Err(_) => return false,
+    };
+    let host = match parsed.host_str() {
+        Some(h) => h.to_ascii_lowercase(),
+        None => return false,
+    };
+    let host = host.strip_prefix("www.").unwrap_or(&host);
+    EXEMPT_HOSTS.iter().any(|exempt| *exempt == host)
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{Content, ExtractionResult, Metadata};
+
+    fn make_result(word_count: usize, url: Option<&str>) -> ExtractionResult {
+        ExtractionResult {
+            metadata: Metadata {
+                title: Some("Test Page".to_string()),
+                description: None,
+                author: None,
+                published_date: None,
+                language: None,
+                url: url.map(|s| s.to_string()),
+                site_name: None,
+                image: None,
+                favicon: None,
+                word_count,
+                http_status: Some(200),
+            },
+            content: Content {
+                markdown: String::new(),
+                plain_text: String::new(),
+                links: Vec::new(),
+                images: Vec::new(),
+                code_blocks: Vec::new(),
+                raw_html: None,
+            },
+            domain_data: None,
+            structured_data: Vec::new(),
+        }
+    }
+
+    #[test]
+    fn test_thin_body_detected_at_under_500_words() {
+        // 200-word HR root simulation.
+        let result = make_result(200, Some("https://www.hollywoodreporter.com/"));
+        assert_eq!(
+            classify(&result),
+            ThinBodyClassification::Thin { word_count: 200 }
+        );
+    }
+
+    #[test]
+    fn test_thin_body_not_detected_at_over_500_words() {
+        // 1000-word substantive article.
+        let result = make_result(1000, Some("https://www.hollywoodreporter.com/c/movies/"));
+        assert_eq!(classify(&result), ThinBodyClassification::NotThin);
+    }
+
+    #[test]
+    fn test_thin_body_not_detected_at_exact_threshold() {
+        // Boundary: 500 words exactly is NOT thin (strict <).
+        let result = make_result(500, Some("https://www.hollywoodreporter.com/"));
+        assert_eq!(classify(&result), ThinBodyClassification::NotThin);
+    }
+
+    #[test]
+    fn test_thin_body_exempt_on_example_com() {
+        let result = make_result(20, Some("https://example.com/"));
+        assert_eq!(classify(&result), ThinBodyClassification::Exempt);
+    }
+
+    #[test]
+    fn test_thin_body_exempt_on_example_com_with_www() {
+        // www. prefix is stripped before matching.
+        let result = make_result(20, Some("https://www.example.com/"));
+        assert_eq!(classify(&result), ThinBodyClassification::Exempt);
+    }
+
+    #[test]
+    fn test_thin_body_exempt_on_httpbin() {
+        let result = make_result(5, Some("https://httpbin.org/html"));
+        assert_eq!(classify(&result), ThinBodyClassification::Exempt);
+    }
+
+    #[test]
+    fn test_thin_body_exempt_on_localhost() {
+        let result = make_result(10, Some("http://localhost:8080/"));
+        assert_eq!(classify(&result), ThinBodyClassification::Exempt);
+    }
+
+    #[test]
+    fn test_thin_body_thin_with_no_url() {
+        // No URL (e.g. local-file / --stdin input): the exemption check is
+        // skipped and the page is classified as Thin. NOTE(review): confirm the
+        // CLI suppresses the hint for such inputs; `classify` itself does not.
+        let result = make_result(50, None);
+        assert_eq!(
+            classify(&result),
+            ThinBodyClassification::Thin { word_count: 50 }
+        );
+    }
+
+    #[test]
+    fn test_thin_body_hint_line_shape() {
+        let cls = ThinBodyClassification::Thin { word_count: 228 };
+        let hint = cls.hint_line().expect("Thin should produce a hint");
+        assert!(hint.contains("228 words"));
+        assert!(hint.contains("<500"));
+        assert!(hint.contains("subsection URL"));
+    }
+
+    #[test]
+    fn test_thin_body_hint_line_none_for_not_thin() {
+        assert_eq!(ThinBodyClassification::NotThin.hint_line(), None);
+        assert_eq!(ThinBodyClassification::Exempt.hint_line(), None);
+    }
+}