mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-11 22:55:13 +02:00
feat(core): JS-hub page detector + --prefer-articles flag
Detects ESPN-style hub pages (espn.com/nba/, /nfl/, /mlb/, /nhl/, /soccer/) where the rendered markup contains only navigation and no article bodies — a Chrome retry doesn't help because the data genuinely isn't in the markup. Heuristic: word_count < 500 AND link_count >= 5, measured on the extracted output. --prefer-articles: when set, a hub-classified page returns the extracted link list (reusing the M1 --mode summary machinery) instead of the sparse body; on non-hub pages, behavior is unchanged. stderr hint: always emitted on hub detection, so the caller knows to drill into the /story/_/id/<id>/ URLs from a citation list. False-positive resistance verified: BBC News /world (link-heavy aggregator, 1500+ word body) and n1info.rs (widget-heavy but content-rich) both classify as non-hub and emit full extraction. Adds 9 new tests in webclaw-core (317 -> 326).
This commit is contained in:
parent
339f41bb7c
commit
31a8f6150f
4 changed files with 383 additions and 4 deletions
|
|
@ -180,6 +180,14 @@ struct Cli {
|
|||
#[arg(long, default_value = "0")]
|
||||
max_output_bytes: u64,
|
||||
|
||||
/// When the page is detected as a JS hub (short body + nav-style link list,
|
||||
/// e.g. ESPN /nba /nfl /mlb /nhl /soccer), return only the extracted link
|
||||
/// list (equivalent to --mode summary). Non-hub pages are unchanged.
|
||||
/// A one-line stderr hint is also emitted on hub detection regardless of
|
||||
/// this flag, so callers can react on the next invocation.
|
||||
#[arg(long)]
|
||||
prefer_articles: bool,
|
||||
|
||||
/// Browser to impersonate
|
||||
#[arg(short, long, default_value = "chrome")]
|
||||
browser: Browser,
|
||||
|
|
@ -1122,6 +1130,37 @@ fn print_output_with_mode(
|
|||
println!("{out}");
|
||||
}
|
||||
|
||||
/// Runs the hub-page detector (`webclaw_core::classify_hub`, referred to as
/// the "iter-2 M2" detector) on `result` and picks the output mode to emit
/// with. When a hub is detected:
|
||||
/// - a single `# hint: ...` line is always written to stderr (informational only),
|
||||
/// - if `prefer_articles` is on, the `OutputMode` is overridden to `Summary`
|
||||
///   so the caller gets the link list directly without re-invoking.
|
||||
///
|
||||
/// Returns the effective `OutputMode` to use for emission. When no hub
|
||||
/// is detected, a clone of `requested_mode` is returned and nothing is
|
||||
/// written to stderr. Note that this function does not itself distinguish
/// local from cloud results: the call site visible in `main` only invokes it
/// for `FetchOutput::Local`, which is what keeps cloud output out of this path.
|
||||
///
|
||||
/// Designed to be additive — with `prefer_articles = false` the returned mode
|
||||
/// is always a clone of `requested_mode`, and the hint goes to stderr rather
|
||||
/// than stdout, so stdout byte counts are not affected by the hint.
/// NOTE(review): the original comment ties this to "sentinel byte-counting on
/// p01-p15"; presumably a set of regression fixtures — confirm.
|
||||
fn apply_hub_detection(
|
||||
result: &ExtractionResult,
|
||||
requested_mode: &OutputMode,
|
||||
prefer_articles: bool,
|
||||
) -> OutputMode {
|
||||
// Classification is delegated entirely to webclaw-core; this wrapper only
// reacts to the verdict.
let classification = webclaw_core::classify_hub(result);
|
||||
// Not a hub: keep the caller's mode and stay silent on stderr.
if !classification.is_hub {
|
||||
return requested_mode.clone();
|
||||
}
|
||||
// Always emit the informational hint on hub detection — stderr only,
// independent of `prefer_articles`.
|
||||
eprintln!("# hint: {}", classification.hint_line());
|
||||
if prefer_articles {
|
||||
// Caller opted in to honoring the detection: switch to summary mode,
// overriding whatever mode was requested.
|
||||
OutputMode::Summary
|
||||
} else {
|
||||
requested_mode.clone()
|
||||
}
|
||||
}
|
||||
|
||||
/// Print cloud API response in the requested format.
|
||||
fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
|
||||
match format {
|
||||
|
|
@ -2754,6 +2793,7 @@ async fn main() {
|
|||
// Single-page extraction (handles both HTML and PDF via content-type detection)
|
||||
match fetch_and_extract(&cli).await {
|
||||
Ok(FetchOutput::Local(result)) => {
|
||||
let effective_mode = apply_hub_detection(&result, &cli.mode, cli.prefer_articles);
|
||||
if let Some(ref dir) = cli.output_dir {
|
||||
let url = cli
|
||||
.urls
|
||||
|
|
@ -2766,7 +2806,7 @@ async fn main() {
|
|||
&result,
|
||||
&cli.format,
|
||||
cli.metadata,
|
||||
&cli.mode,
|
||||
&effective_mode,
|
||||
cli.max_output_bytes,
|
||||
);
|
||||
if let Err(e) = write_to_file(dir, &filename, &content) {
|
||||
|
|
@ -2778,7 +2818,7 @@ async fn main() {
|
|||
&result,
|
||||
&cli.format,
|
||||
cli.metadata,
|
||||
&cli.mode,
|
||||
&effective_mode,
|
||||
cli.max_output_bytes,
|
||||
);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue