feat(core): JS-hub page detector + --prefer-articles flag

Detects ESPN-style hub pages (espn.com/nba/, /nfl/, /mlb/, /nhl/, /soccer/)
where the rendered markup has nav-only content with no article bodies —
chrome retry doesn't help because the data genuinely isn't in the markup.
Heuristic: word_count < 500 AND link_count >= 5 against the extracted output.

--prefer-articles: when set, a hub-classified page returns the extracted
link list (reusing the M1 --mode summary machinery) instead of the sparse
body. On non-hub pages, behavior is unchanged.

stderr hint: always emitted on hub detection so the caller knows to drill
/story/_/id/<id>/ URLs from a citation list.

False-positive resistance verified: BBC News /world (link-heavy aggregator,
1500+ words body) and n1info.rs (widget-heavy but content-rich) both
classify as non-hub and emit full extraction.

9 new tests in webclaw-core (317 -> 326).
This commit is contained in:
devnen 2026-05-23 18:55:17 +02:00
parent 339f41bb7c
commit 31a8f6150f
4 changed files with 383 additions and 4 deletions

View file

@ -180,6 +180,14 @@ struct Cli {
#[arg(long, default_value = "0")]
max_output_bytes: u64,
/// When the page is detected as a JS hub (short body + nav-style link list,
/// e.g. ESPN /nba /nfl /mlb /nhl /soccer), return only the extracted link
/// list (equivalent to --mode summary). Non-hub pages are unchanged.
/// A one-line stderr hint is also emitted on hub detection regardless of
/// this flag, so callers can react on the next invocation.
#[arg(long)]
prefer_articles: bool,
/// Browser to impersonate
#[arg(short, long, default_value = "chrome")]
browser: Browser,
@ -1122,6 +1130,37 @@ fn print_output_with_mode(
println!("{out}");
}
/// Run the core hub-page classifier over an extracted page and pick the
/// `OutputMode` that should be used when emitting it.
///
/// Outcomes:
/// - page is not classified as a hub: a clone of `requested_mode` is
///   returned and nothing is written anywhere;
/// - page is classified as a hub: one `# hint: ...` line is written to
///   stderr, then `OutputMode::Summary` is returned when `prefer_articles`
///   is set, otherwise a clone of `requested_mode`.
///
/// The hint is written to stderr only, so callers that do not pass
/// `--prefer-articles` get the same stdout they would have had without
/// hub detection.
fn apply_hub_detection(
    result: &ExtractionResult,
    requested_mode: &OutputMode,
    prefer_articles: bool,
) -> OutputMode {
    let verdict = webclaw_core::classify_hub(result);

    if verdict.is_hub {
        // Informational hint: emitted on every hub detection, flag or not.
        eprintln!("# hint: {}", verdict.hint_line());

        if prefer_articles {
            // The caller opted in: hand back the link list directly.
            return OutputMode::Summary;
        }
    }

    // Non-hub page, or hub without the opt-in flag: leave the mode alone.
    requested_mode.clone()
}
/// Print cloud API response in the requested format.
fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
match format {
@ -2754,6 +2793,7 @@ async fn main() {
// Single-page extraction (handles both HTML and PDF via content-type detection)
match fetch_and_extract(&cli).await {
Ok(FetchOutput::Local(result)) => {
let effective_mode = apply_hub_detection(&result, &cli.mode, cli.prefer_articles);
if let Some(ref dir) = cli.output_dir {
let url = cli
.urls
@ -2766,7 +2806,7 @@ async fn main() {
&result,
&cli.format,
cli.metadata,
&cli.mode,
&effective_mode,
cli.max_output_bytes,
);
if let Err(e) = write_to_file(dir, &filename, &content) {
@ -2778,7 +2818,7 @@ async fn main() {
&result,
&cli.format,
cli.metadata,
&cli.mode,
&effective_mode,
cli.max_output_bytes,
);
}

View file

@ -26,8 +26,8 @@ pub use diff::{ChangeStatus, ContentDiff, MetadataChange};
pub use domain::DomainType;
pub use error::ExtractError;
pub use llm::{
to_json_summary, to_json_toc, to_llm_summary, to_llm_text, to_llm_toc,
truncate_json_with_wrapper, truncate_with_footer,
classify_hub, to_json_summary, to_json_toc, to_llm_summary, to_llm_text, to_llm_toc,
truncate_json_with_wrapper, truncate_with_footer, HubClassification,
};
pub use types::{
CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,

View file

@ -0,0 +1,337 @@
/// JS-hub page detector.
///
/// Some sites (notably ESPN /nba /nfl /mlb /nhl /soccer) render most of
/// their content via JavaScript and ship a thin "nav-card hub" in the
/// initial HTML: short body, a small set of nav-style links, and no real
/// article prose. Chrome retry does not help — the article body genuinely
/// isn't in the rendered DOM; it lives behind further JS API calls under
/// `/story/_/id/<id>/...` URLs.
///
/// This module classifies an `ExtractionResult` as "hub" / "not hub" so
/// callers can either emit a stderr hint or honor `--prefer-articles`
/// and return just the extracted link list.
///
/// Heuristic (iter-2 phase A measured baseline, see
/// `baselines/probe-run-r-iter2-baseline.json`):
///
/// is_hub = (word_count < `WORD_COUNT_THRESHOLD`)
/// AND (link_count >= `MIN_LINK_COUNT`)
///
/// Calibration against the iter-0 corpus + iter-2 hub probes:
/// - ESPN /nba (288 words, 7 links) -> HUB
/// - ESPN /nfl (304 words, 7 links) -> HUB
/// - ESPN root (330 words, 7 links) -> HUB (borderline accepted)
/// - BBC /news/world (1981 words, 28 links) -> NOT hub (word_count too high)
/// - n1info root (3015 words, 134 links) -> NOT hub (word_count too high)
/// - THR root (209 words, 1 link) -> NOT hub (link_count too low)
/// - Reuters ME broken-fetch (21 words, 0 links) -> NOT hub
/// - synthetic url-escape (85 words, 0 links) -> NOT hub
///
/// 8 / 8 correct with comfortable margins on both sides.
use crate::types::ExtractionResult;
use super::body;
use super::links;
/// Upper bound (exclusive) on body words for a page to be a hub
/// candidate; the candidate must additionally satisfy `MIN_LINK_COUNT`.
/// Iter-2 phase A picked 500 so that the lowest aggregator word count
/// seen in the corpus (BBC /news/world = 1981 words) sits more than 3.9x
/// above the threshold.
pub const WORD_COUNT_THRESHOLD: usize = 500;
/// Lower bound (inclusive) on links for a hub candidate. Excludes
/// broken / thin-body / synthetic pages that are short but are not hubs.
/// Iter-2 phase A picked 5, below the lowest observed hub link_count of
/// 7, to leave a safety margin.
pub const MIN_LINK_COUNT: usize = 5;
/// Outcome of classifying an `ExtractionResult`, bundled with the raw
/// signals that produced it so callers can emit a useful stderr hint.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct HubClassification {
    /// `true` when the page matched the hub heuristic.
    pub is_hub: bool,
    /// Body word count measured for the page.
    pub word_count: usize,
    /// Link count measured for the page.
    pub link_count: usize,
}

impl HubClassification {
    /// Render the per-page numbers as a single-line, human-readable hint.
    ///
    /// The returned text has no leading `# hint:` marker and no trailing
    /// newline; the caller decides how to frame it.
    pub fn hint_line(&self) -> String {
        // Each trailing `\` joins the literal onto one line (the newline
        // and the next line's leading whitespace are dropped), so the
        // sentences are separated only by the space before each `\`.
        format!(
            "this page looks like a JS hub (word_count={}, link_count={}). \
             The article body is likely not in the rendered DOM; drill /story/_/id/<id>/ \
             or similar article URLs for content. \
             Use --prefer-articles to return the extracted link list directly.",
            self.word_count, self.link_count
        )
    }
}
/// Classify an extraction result as hub / not-hub.
///
/// The page's markdown is run through `body::process_body` exactly once
/// and both signals are derived from that single result:
///
/// - `word_count`: whitespace-separated tokens in the processed body text.
///   `result.metadata.word_count` is deliberately not used: per the
///   detector's design notes it is computed from the raw, chrome-inclusive
///   plain text and would over-count hub pages.
/// - `link_count`: processed links whose label is still non-empty after
///   `links::clean_link_label`. This is intended to match the link
///   collection of `to_llm_summary`, so the number reported here lines up
///   with what `--prefer-articles` prints.
///
/// A page is a hub when `word_count < WORD_COUNT_THRESHOLD` and
/// `link_count >= MIN_LINK_COUNT`. The returned value carries both raw
/// counts alongside the verdict.
pub fn classify(result: &ExtractionResult) -> HubClassification {
    // Single pass through the body pipeline; both counts read from it.
    let processed = body::process_body(&result.content.markdown);

    // `split_whitespace` never yields empty tokens, so a plain count is
    // already the word count.
    let word_count = processed.text.split_whitespace().count();

    // Links whose cleaned label is empty are noise and are not counted.
    let link_count = processed
        .links
        .into_iter()
        .filter(|(text, _href)| !links::clean_link_label(text).is_empty())
        .count();

    HubClassification {
        is_hub: word_count < WORD_COUNT_THRESHOLD && link_count >= MIN_LINK_COUNT,
        word_count,
        link_count,
    }
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
use crate::types::{Content, ExtractionResult, Metadata};
/// Build a minimal `ExtractionResult` whose only meaningful payload is
/// `content.markdown`; every other field is a fixed placeholder.
fn make_result(markdown: &str) -> ExtractionResult {
    let metadata = Metadata {
        title: Some(String::from("Test Page")),
        description: None,
        author: None,
        published_date: None,
        language: None,
        url: Some(String::from("https://example.com/")),
        site_name: None,
        image: None,
        favicon: None,
        word_count: 0,
    };

    let content = Content {
        markdown: markdown.to_owned(),
        plain_text: String::new(),
        links: vec![],
        images: vec![],
        code_blocks: vec![],
        raw_html: None,
    };

    ExtractionResult {
        metadata,
        content,
        domain_data: None,
        structured_data: vec![],
    }
}
/// Generate a synthetic markdown page: a heading, a run of prose
/// paragraphs, then a `## Links` section with `n_links` bullet links.
///
/// `n_body_words` is a target, not an exact count: the number of
/// paragraphs is `ceil(n_body_words / 15)`, and each paragraph is an
/// `Item {i}: ` stamp followed by one sentence from a fixed pool of ten,
/// used round-robin.
fn synth_hub(n_links: usize, n_body_words: usize) -> String {
    const SENTENCE_POOL: [&str; 10] = [
        "The proposed amendment would require ratification by at least three quarters of the member legislatures",
        "Investigators say the malfunction was traced to a faulty heat exchanger in the secondary loop",
        "Critics argue that the policy would disproportionately burden small businesses already operating on thin margins",
        "Researchers documented behavioral changes in juvenile salmon exposed to elevated water temperatures over time",
        "The committee voted unanimously to defer the matter pending further independent technical review next quarter",
        "Lawyers for the defendant filed a motion seeking dismissal on procedural grounds before trial began",
        "Survey respondents reported declining confidence in the long term solvency of the pension fund balance",
        "Officials confirmed that the planned shutdown would last approximately seventy two hours barring complications",
        "Analysts noted that quarterly revenue exceeded internal projections despite weakness in two regional markets",
        "Volunteers spent the weekend clearing debris and restoring access along the lower river trail",
    ];
    // Nominal paragraph length used only to turn the word target into a
    // paragraph count.
    const NOMINAL_WORDS_PER_SENTENCE: usize = 15;

    let paragraph_total = n_body_words.div_ceil(NOMINAL_WORDS_PER_SENTENCE);

    // The stamp is placed in front of the sentence so that paragraphs
    // reusing the same pool sentence still start with different words and
    // are never byte-identical; the fixture relies on this to survive the
    // body pipeline's dedup passes.
    let paragraphs = SENTENCE_POOL
        .iter()
        .cycle()
        .take(paragraph_total)
        .enumerate()
        .map(|(idx, sentence)| format!("Item {idx}: {sentence}.\n\n"));

    let link_rows = (0..n_links)
        .map(|idx| format!("- [Story headline {idx}](https://example.com/story/{idx})\n"));

    let mut page = String::from("# Synthetic Hub Page\n\n");
    page.extend(paragraphs);
    page.push_str("## Links\n\n");
    page.extend(link_rows);
    page
}
// ----- detector recognizes hub-shaped pages -----
/// p35-equivalent (ESPN /nba shape). The live measurement was 288 words
/// and 7 links; this fixture uses the phase B synthetic spec instead
/// (30 links, ~200 body words). Expects a hub verdict with both raw
/// signals on the hub side of their thresholds.
#[test]
fn test_hub_detector_recognizes_espn_nba() {
    let c = classify(&make_result(&synth_hub(30, 200)));

    assert!(c.is_hub, "expected hub; got {c:?}");

    let words_ok = c.word_count < WORD_COUNT_THRESHOLD;
    assert!(words_ok, "words {} >= threshold", c.word_count);

    let links_ok = c.link_count >= MIN_LINK_COUNT;
    assert!(links_ok, "links {} < min", c.link_count);
}
/// p36-equivalent (ESPN /nfl shape): fewer links and a slightly longer
/// body than the /nba fixture, still expected to classify as a hub.
#[test]
fn test_hub_detector_recognizes_espn_nfl() {
    let page = synth_hub(7, 304);
    let c = classify(&make_result(&page));
    assert!(c.is_hub, "expected hub; got {c:?}");
}
/// p38-equivalent: a link-heavy aggregator that nevertheless carries a
/// substantial prose body (phase A reference: BBC /news/world, 1981 words
/// and 28 links). The word count must keep it out of the hub class.
#[test]
fn test_hub_detector_passes_aggregator_with_real_body() {
    let c = classify(&make_result(&synth_hub(100, 1500)));

    let flagged = c.is_hub;
    assert!(
        !flagged,
        "false positive on link-heavy but content-rich page; got {c:?}"
    );
    assert!(c.word_count >= WORD_COUNT_THRESHOLD);
}
/// The common case: a long article with only a handful of links must not
/// be classified as a hub. The body target is deliberately oversized
/// (~2400 words) so that whatever the body pipeline's dedup removes, the
/// remaining word count stays above the threshold.
#[test]
fn test_hub_detector_passes_normal_article() {
    let page = synth_hub(5, 2400);
    let c = classify(&make_result(&page));

    let flagged = c.is_hub;
    assert!(
        !flagged,
        "false positive on normal article; got {c:?} (threshold {})",
        WORD_COUNT_THRESHOLD
    );
    assert!(c.word_count >= WORD_COUNT_THRESHOLD);
}
/// Iter-0 corpus cross-check: a THR-style thin-body page (few words, a
/// single link). It must stay out of the hub class because the link gate
/// is not met; per issue #3 / M10 the intended remedy for that page is
/// chrome retry, not hub detection.
#[test]
fn test_hub_detector_excludes_thin_body_thr_shape() {
    let c = classify(&make_result(&synth_hub(1, 209)));

    let flagged = c.is_hub;
    assert!(!flagged, "thin-body misclassified as hub; got {c:?}");
    assert!(c.link_count < MIN_LINK_COUNT);
}
/// Iter-0 corpus cross-check: a broken / nearly-empty fetch (Reuters ME
/// baseline: 21 words, 0 links) must not be classified as a hub.
#[test]
fn test_hub_detector_excludes_broken_low_link() {
    let page = synth_hub(0, 21);
    let c = classify(&make_result(&page));

    let flagged = c.is_hub;
    assert!(!flagged, "broken-fetch misclassified as hub; got {c:?}");
}
/// Models the `--prefer-articles` decision point end to end: when the
/// classifier says "hub", the CLI switches to summary mode, so the
/// summary emitter must produce a link section for the same fixture.
/// Checks that the two pieces compose (hub verdict -> summary output
/// that contains the link list).
#[test]
fn test_prefer_articles_emits_link_list_on_hub() {
    let r = make_result(&synth_hub(30, 200));

    let c = classify(&r);
    assert!(c.is_hub, "fixture must be hub-shaped; got {c:?}");

    // With --prefer-articles on a hub the summary emitter is used in
    // place of the full-text one; its output must carry the link list.
    let summary = crate::llm::to_llm_summary(&r, Some("https://example.com/"));

    let expectations = [
        ("## Links", "summary missing Links header"),
        ("Story headline 0", "summary missing first link label"),
        ("https://example.com/story/0", "summary missing first link href"),
    ];
    for (needle, complaint) in expectations {
        assert!(summary.contains(needle), "{complaint}: {summary}");
    }
}
/// Negative sentinel for `--prefer-articles`: on a BBC-like, content-rich
/// aggregator the classifier must report `is_hub == false`, which is the
/// condition under which the CLI keeps the requested mode. This is the
/// false-positive-resistance guarantee for p42_bbc_world.
#[test]
fn test_prefer_articles_falls_through_on_non_hub() {
    let page = synth_hub(100, 2400);
    let c = classify(&make_result(&page));

    // The CLI's apply_hub_detection() returns the requested mode
    // unchanged whenever is_hub is false, so the verdict itself is the
    // whole contract to check here.
    let flagged = c.is_hub;
    assert!(
        !flagged,
        "non-hub aggregator must not flip with --prefer-articles; got {c:?}"
    );
}
/// The stderr hint must be actionable: it has to report both raw signals
/// under their labels and name the flag the user can pass next.
///
/// The signals are matched together with their `word_count=` /
/// `link_count=` labels; matching a bare digit such as `7` would also
/// pass on any unrelated occurrence of that digit in the message.
#[test]
fn test_hub_classification_hint_line_mentions_signals() {
    let c = HubClassification {
        is_hub: true,
        word_count: 288,
        link_count: 7,
    };
    let hint = c.hint_line();
    assert!(
        hint.contains("word_count=288"),
        "missing word count: {hint}"
    );
    assert!(
        hint.contains("link_count=7"),
        "missing link count: {hint}"
    );
    assert!(
        hint.contains("--prefer-articles"),
        "missing flag suggestion: {hint}"
    );
}
}

View file

@ -6,11 +6,13 @@
/// to a deduplicated section at the end.
mod body;
mod cleanup;
mod hub_detect;
mod images;
mod links;
mod metadata;
mod output_size;
pub use hub_detect::{classify as classify_hub, HubClassification};
pub use output_size::{
to_json_summary, to_json_toc, to_llm_summary, to_llm_toc, truncate_json_with_wrapper,
truncate_with_footer,