mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-10 22:45:13 +02:00
feat(core): --mode sections for nav-URL discovery
Section-URL ambiguity is recurring friction — callers have to guess whether to hit infobae.com root (LATAM frontpage) or /economia/ (AR-specific live FX dashboard), or decrypt.co root (ticker ribbon) vs /news/ (article list), or bbc.com/news/world vs /news/world/europe/. Each guess costs a round-trip. New `--mode sections` returns the discoverable section URLs parsed from the page's nav, in one round-trip. Subsumes issue #16 (non-English nav is harder to LLM-parse — sections come back as data, not prose). Multi-signal heuristic on the existing link extraction: URL-pattern match (/<category>/-style short paths), repetition (section links appear in header + footer), DOM-position when available. Fallback when zero sections detected: emit top-N links with a "(none detected; first N shown)" note. Format: -f llm/text emits `Sections:` followed by `- [Label](url)` list. -f json emits `{"sections": [{"label": "...", "url": "..."}]}`. 13 new tests in webclaw-core (688 -> 701).
This commit is contained in:
parent
76cd515a3e
commit
ade2a5143c
4 changed files with 542 additions and 6 deletions
|
|
@ -171,7 +171,8 @@ struct Cli {
|
|||
#[arg(short, long, default_value = "markdown")]
|
||||
format: OutputFormat,
|
||||
|
||||
/// Output mode: full (default), summary (link list), or toc (H1/H2 outline + first paragraph).
|
||||
/// Output mode: full (default), summary (link list), toc (H1/H2 outline + first paragraph),
|
||||
/// or sections (nav-level section URLs only, for section discovery).
|
||||
/// Orthogonal to --format; e.g. `-f json --mode summary` returns a JSON link array.
|
||||
#[arg(long, default_value = "full")]
|
||||
mode: OutputMode,
|
||||
|
|
@ -452,13 +453,17 @@ enum OutputFormat {
|
|||
|
||||
/// Output mode. `full` is the default and matches the historical
|
||||
/// behaviour; `summary` returns just the navigation/link list; `toc`
|
||||
/// returns the H1/H2 outline plus the first paragraph after each H2.
|
||||
/// returns the H1/H2 outline plus the first paragraph after each H2;
|
||||
/// `sections` (M8, issue #14) returns nav-level section URLs only for
|
||||
/// section discovery on hub/aggregator pages.
|
||||
/// Orthogonal to `--format`.
|
||||
#[derive(Clone, ValueEnum, PartialEq, Eq)]
|
||||
enum OutputMode {
|
||||
// Default mode; matches the historical behaviour.
Full,
|
||||
// Navigation/link list only.
Summary,
|
||||
// H1/H2 outline plus the first paragraph after each H2.
Toc,
|
||||
/// sections: nav-level section URLs only (for section discovery)
Sections,
|
||||
}
|
||||
|
||||
|
||||
#[derive(Clone, ValueEnum)]
|
||||
|
|
@ -829,6 +834,12 @@ fn render_body(
|
|||
OutputFormat::Json => webclaw_core::to_json_toc(result),
|
||||
_ => webclaw_core::to_llm_toc(result, result.metadata.url.as_deref()),
|
||||
},
|
||||
OutputMode::Sections => match format {
|
||||
OutputFormat::Json => {
|
||||
webclaw_core::to_json_sections(result, result.metadata.url.as_deref())
|
||||
}
|
||||
_ => webclaw_core::to_llm_sections(result, result.metadata.url.as_deref()),
|
||||
},
|
||||
OutputMode::Full => match format {
|
||||
OutputFormat::Markdown => {
|
||||
let mut out = String::new();
|
||||
|
|
@ -1269,7 +1280,14 @@ fn apply_hub_detection(
|
|||
eprintln!("# hint: {}", classification.hint_line());
|
||||
let mode = if prefer_articles {
|
||||
// Caller asked us to honor the detection: switch to summary.
|
||||
OutputMode::Summary
|
||||
// M8: if the caller asked for sections explicitly, preserve it —
|
||||
// section listing is more specific than the summary link list,
|
||||
// so don't downgrade Sections → Summary on hub-detect.
|
||||
if matches!(requested_mode, OutputMode::Sections) {
|
||||
OutputMode::Sections
|
||||
} else {
|
||||
OutputMode::Summary
|
||||
}
|
||||
} else {
|
||||
requested_mode.clone()
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue