feat(core): --mode sections for nav-URL discovery

Section-URL ambiguity is a recurring source of friction: callers have
to guess whether to hit the infobae.com root (LATAM frontpage) or
/economia/ (AR-specific live FX dashboard), the decrypt.co root
(ticker ribbon) or /news/ (article list), bbc.com/news/world or
/news/world/europe/. Each guess costs a round-trip.

The new `--mode sections` returns the discoverable section URLs parsed
from the page's nav in a single round-trip. It subsumes issue #16
(non-English nav is harder for an LLM to parse): sections now come
back as data, not prose.

It applies a multi-signal heuristic on top of the existing link
extraction: URL-pattern match (short /<category>/-style paths),
repetition (section links appear in both header and footer), and DOM
position when available. Fallback when zero sections are detected:
emit the top-N links with a "(none detected; first N shown)" note.

Format: -f llm/text emits `Sections:` followed by a `- [Label](url)`
list; -f json emits `{"sections": [{"label": "...", "url": "..."}]}`.

13 new tests in webclaw-core (688 -> 701).
This commit is contained in:
devnen 2026-05-23 23:14:40 +02:00
parent 76cd515a3e
commit ade2a5143c
4 changed files with 542 additions and 6 deletions

View file

@ -171,7 +171,8 @@ struct Cli {
#[arg(short, long, default_value = "markdown")]
format: OutputFormat,
/// Output mode: full (default), summary (link list), or toc (H1/H2 outline + first paragraph).
/// Output mode: full (default), summary (link list), toc (H1/H2 outline + first paragraph),
/// or sections (nav-level section URLs only, for section discovery).
/// Orthogonal to --format; e.g. `-f json --mode summary` returns a JSON link array.
#[arg(long, default_value = "full")]
mode: OutputMode,
@ -452,13 +453,17 @@ enum OutputFormat {
/// Output mode. `full` is the default and matches the historical
/// behaviour; `summary` returns just the navigation/link list; `toc`
/// returns the H1/H2 outline plus the first paragraph after each H2.
/// returns the H1/H2 outline plus the first paragraph after each H2;
/// `sections` (M8, issue #14) returns nav-level section URLs only for
/// section discovery on hub/aggregator pages.
/// Orthogonal to `--format`.
// `Clone` is relied on where the requested mode is passed through unchanged
// (see `requested_mode.clone()` in the hub-detection path).
// NOTE(review): the variant notes below use plain `//` so that any help text
// generated from `///` comments stays exactly as it is — confirm whether they
// should be promoted to `///`.
#[derive(Clone, ValueEnum, PartialEq, Eq)]
enum OutputMode {
// Default mode; matches the historical behaviour.
Full,
// Returns just the navigation/link list.
Summary,
// Returns the H1/H2 outline plus the first paragraph after each H2.
Toc,
/// sections: nav-level section URLs only (for section discovery)
Sections,
}
#[derive(Clone, ValueEnum)]
@ -829,6 +834,12 @@ fn render_body(
OutputFormat::Json => webclaw_core::to_json_toc(result),
_ => webclaw_core::to_llm_toc(result, result.metadata.url.as_deref()),
},
OutputMode::Sections => match format {
OutputFormat::Json => {
webclaw_core::to_json_sections(result, result.metadata.url.as_deref())
}
_ => webclaw_core::to_llm_sections(result, result.metadata.url.as_deref()),
},
OutputMode::Full => match format {
OutputFormat::Markdown => {
let mut out = String::new();
@ -1269,7 +1280,14 @@ fn apply_hub_detection(
eprintln!("# hint: {}", classification.hint_line());
let mode = if prefer_articles {
// Caller asked us to honor the detection: switch to summary.
OutputMode::Summary
// M8: if the caller asked for sections explicitly, preserve it —
// section listing is more specific than the summary link list,
// so don't downgrade Sections → Summary on hub-detect.
if matches!(requested_mode, OutputMode::Sections) {
OutputMode::Sections
} else {
OutputMode::Summary
}
} else {
requested_mode.clone()
};