feat(core): --mode sections for nav-URL discovery

Section-URL ambiguity is recurring friction — callers have to guess whether to hit infobae.com root (LATAM frontpage) or /economia/ (AR- specific live FX dashboard), or decrypt.co root (ticker ribbon) vs /news/ (article list), or bbc.com/news/world vs /news/world/europe/. Each guess costs a round-trip. New `--mode sections` returns the discoverable section URLs parsed from the page's nav, in one round-trip. Subsumes issue #16 (non- English nav harder to LLM-parse — sections come back as data, not prose). Multi-signal heuristic on the existing link extraction: URL-pattern match (/<category>/ style short paths), repetition (section links appear in header + footer), DOM-position when available. Fallback when zero sections detected: emit top-N links with a "(none detected; first N shown)" note. Format: -f llm/text emits `Sections:` followed by `- [Label](url)` list. -f json emits `{"sections": [{"label": "...", "url": "..."}]}`. 13 new tests in webclaw-core (688 -> 701).
2026-06-12 23:05:12 +02:00 · 2026-05-23 23:14:40 +02:00 · 2026-05-23 23:14:40 +02:00 · ade2a5143c
commit ade2a5143c
parent 76cd515a3e
4 changed files with 542 additions and 6 deletions
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -171,7 +171,8 @@ struct Cli {
    #[arg(short, long, default_value = "markdown")]
    format: OutputFormat,

-    /// Output mode: full (default), summary (link list), or toc (H1/H2 outline + first paragraph).
+    /// Output mode: full (default), summary (link list), toc (H1/H2 outline + first paragraph),
+    /// or sections (nav-level section URLs only, for section discovery).
    /// Orthogonal to --format; e.g. `-f json --mode summary` returns a JSON link array.
    #[arg(long, default_value = "full")]
    mode: OutputMode,
@ -452,13 +453,17 @@ enum OutputFormat {

 /// Output mode. `full` is the default and matches the historical
 /// behaviour; `summary` returns just the navigation/link list; `toc`
-/// returns the H1/H2 outline plus the first paragraph after each H2.
+/// returns the H1/H2 outline plus the first paragraph after each H2;
+/// `sections` (M8, issue #14) returns nav-level section URLs only for
+/// section discovery on hub/aggregator pages.
 /// Orthogonal to `--format`.
 #[derive(Clone, ValueEnum, PartialEq, Eq)]
 enum OutputMode {
    Full,
    Summary,
    Toc,
+    /// sections: nav-level section URLs only (for section discovery)
+    Sections,
 }

 #[derive(Clone, ValueEnum)]
@ -829,6 +834,12 @@ fn render_body(
            OutputFormat::Json => webclaw_core::to_json_toc(result),
            _ => webclaw_core::to_llm_toc(result, result.metadata.url.as_deref()),
        },
+        OutputMode::Sections => match format {
+            OutputFormat::Json => {
+                webclaw_core::to_json_sections(result, result.metadata.url.as_deref())
+            }
+            _ => webclaw_core::to_llm_sections(result, result.metadata.url.as_deref()),
+        },
        OutputMode::Full => match format {
            OutputFormat::Markdown => {
                let mut out = String::new();
@ -1269,7 +1280,14 @@ fn apply_hub_detection(
    eprintln!("# hint: {}", classification.hint_line());
    let mode = if prefer_articles {
        // Caller asked us to honor the detection: switch to summary.
-        OutputMode::Summary
+        // M8: if the caller asked for sections explicitly, preserve it —
+        // section listing is more specific than the summary link list,
+        // so don't downgrade Sections → Summary on hub-detect.
+        if matches!(requested_mode, OutputMode::Sections) {
+            OutputMode::Sections
+        } else {
+            OutputMode::Summary
+        }
    } else {
        requested_mode.clone()
    };
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@ -31,9 +31,10 @@ pub use jsonld::{
    ArticleRef, JsonLdSchema, LiveUpdate,
 };
 pub use llm::{
-    classify_hub, classify_thin_body, to_json_summary, to_json_toc, to_llm_summary, to_llm_text,
-    to_llm_text_with_options, to_llm_toc, truncate_json_with_wrapper, truncate_with_footer,
-    HubClassification, LlmTextOptions, ThinBodyClassification,
+    classify_hub, classify_thin_body, collect_section_links, to_json_sections, to_json_summary,
+    to_json_toc, to_llm_sections, to_llm_summary, to_llm_text, to_llm_text_with_options,
+    to_llm_toc, truncate_json_with_wrapper, truncate_with_footer, HubClassification,
+    LlmTextOptions, ThinBodyClassification,
 };
 pub use types::{
    CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@ -11,9 +11,11 @@ mod images;
 mod links;
 mod metadata;
 mod output_size;
+mod sections;
 mod thin_body;

 pub use hub_detect::{classify as classify_hub, HubClassification};
+pub use sections::{collect_section_links, to_json_sections, to_llm_sections};
 pub use thin_body::{classify as classify_thin_body, ThinBodyClassification};
 pub use output_size::{
    to_json_summary, to_json_toc, to_llm_summary, to_llm_toc, truncate_json_with_wrapper,
--- a/crates/webclaw-core/src/llm/sections.rs
+++ b/crates/webclaw-core/src/llm/sections.rs
@ -0,0 +1,515 @@
+/// Section / nav-URL discovery for hub and aggregator pages.
+///
+/// M8 (issue #14, subsumes #16) — `--mode sections` returns only the
+/// navigation/section URLs the page links to, suitable for an LLM caller
+/// that wants to drill into a category (Economía, Política, Sport,
+/// Health, ...) without first parsing a full extraction.
+///
+/// Approach: this is a pure FILTER over the (label, href) list that
+/// `body::process_body` already produces for the page. No new fetch, no
+/// new HTML parse — the heuristic walks the in-memory link list once and
+/// keeps only links that look like section/nav entries by URL shape +
+/// label shape + same-host + denylist signals (see `is_section_link`).
+///
+/// The `OutputMode::Sections` arm in the CLI calls `to_llm_sections` /
+/// `to_json_sections`. The metadata header is built with
+/// `include_status=false` (mirrors summary/toc — M7 status line is not
+/// useful in a section list).
+use url::Url;
+
+use crate::types::ExtractionResult;
+
+use super::body;
+use super::links;
+use super::metadata::build_metadata_header_with_opts;
+
+// ---------------------------------------------------------------------------
+// Section-detection heuristic
+// ---------------------------------------------------------------------------
+
+/// First-segment denylist. Any URL whose path starts with one of these
+/// segments is rejected as a non-section (price tickers, user pages, auth
+/// flows, comment threads, search). Catches Decrypt's 248-row `/price/*`
+/// ticker ribbon cheaply, plus generic chrome across many sites.
+const DENY_FIRST_SEGMENTS: &[&str] = &[
+    "price",
+    "prices",
+    "quote",
+    "quotes",
+    "comments",
+    "user",
+    "users",
+    "auth",
+    "login",
+    "logout",
+    "register",
+    "signin",
+    "signup",
+    "subscribe",
+    "subscription",
+    "share",
+    "tag",
+    "tags",
+    "search",
+    "cart",
+    "checkout",
+    "account",
+    "profile",
+];
+
+/// Maximum number of path segments a section URL may have. Section paths
+/// are 1 segment (`/sport`) or 2 (`/news/business`); article URLs are
+/// typically 3+ (`/news/articles/<id>`, `/2024/05/23/<slug>`).
+const MAX_PATH_SEGMENTS: usize = 2;
+
+/// Maximum length of a single path segment. Article slugs are usually
+/// longer (`big-news-headline-about-some-topic`); section names are
+/// short (`business`, `health`, `editors-picks`).
+const MAX_SEGMENT_LEN: usize = 30;
+
+/// Decide whether a path segment looks like an article ID rather than a
+/// section name. Article-ID heuristic: length >= 6 chars AND contains at
+/// least 2 ASCII digits AND mixes letters with digits. Matches BBC
+/// `crmp121z3z8o` style and CMS IDs; doesn't trip on `editors-picks` (no
+/// digits) or `2024` (all digits, no letters).
+fn looks_like_article_id(segment: &str) -> bool {
+    if segment.len() < 6 {
+        return false;
+    }
+    let mut digits = 0usize;
+    let mut letters = 0usize;
+    for c in segment.chars() {
+        if c.is_ascii_digit() {
+            digits += 1;
+        } else if c.is_ascii_alphabetic() {
+            letters += 1;
+        }
+    }
+    digits >= 2 && letters >= 1
+}
+
+/// Test whether a URL path is shaped like a section path.
+///
+/// Accepts:
+///   - `/`                (rare — site root link, used by some "Home" nav)
+///   - `/sport`
+///   - `/news/business`
+///   - `/editors-picks`
+///   - `/news/business/` (trailing slash)
+///
+/// Rejects: 3+ segment paths, segments with article-ID shape, segments
+/// matching the denylist, segments containing non-`[a-z0-9-]` chars (case
+/// insensitive on the alpha side), segments longer than 30 chars.
+fn is_section_path(path: &str) -> bool {
+    // Drop leading + trailing slash for segment count.
+    let trimmed = path.trim_start_matches('/').trim_end_matches('/');
+    if trimmed.is_empty() {
+        // Root path "/" — treat as a section (e.g. BBC "Home" link).
+        return true;
+    }
+    let segments: Vec<&str> = trimmed.split('/').collect();
+    if segments.len() > MAX_PATH_SEGMENTS {
+        return false;
+    }
+    for (i, seg) in segments.iter().enumerate() {
+        if seg.is_empty() {
+            return false;
+        }
+        if seg.len() > MAX_SEGMENT_LEN {
+            return false;
+        }
+        // First-segment denylist (price ribbons, user/auth pages, search).
+        if i == 0 && DENY_FIRST_SEGMENTS.contains(&seg.to_ascii_lowercase().as_str()) {
+            return false;
+        }
+        // Article-ID-shaped segment rejection.
+        if looks_like_article_id(seg) {
+            return false;
+        }
+        // Only ASCII alpha-numeric + hyphen. Underscores, dots, digits-only
+        // segments (year-paths like `/2024/`) are not sections.
+        let mut has_alpha = false;
+        for c in seg.chars() {
+            if c.is_ascii_alphabetic() {
+                has_alpha = true;
+            } else if c.is_ascii_digit() || c == '-' {
+                // Allowed.
+            } else {
+                return false;
+            }
+        }
+        if !has_alpha {
+            // Pure-digit segments (`/2024`) are not sections.
+            return false;
+        }
+    }
+    true
+}
+
+/// Same-host check: section links should usually live on the page's own
+/// host (subdomains allowed). Prevents cross-domain promo nav from
+/// polluting the result. Returns true iff `link_host` equals `page_host`
+/// or is a subdomain ending in `.<page_host>`.
+fn same_host(link_host: &str, page_host: &str) -> bool {
+    if link_host.eq_ignore_ascii_case(page_host) {
+        return true;
+    }
+    // Strip leading "www." from both for the subdomain comparison so
+    // `www.bbc.com` matches `bbc.com`.
+    let lh = link_host.trim_start_matches("www.").to_ascii_lowercase();
+    let ph = page_host.trim_start_matches("www.").to_ascii_lowercase();
+    if lh == ph {
+        return true;
+    }
+    lh.ends_with(&format!(".{ph}"))
+}
+
+/// Decide whether `(label, href)` is a section link given the page URL.
+///
+/// Multi-signal AND:
+///   1. URL parses with a scheme http/https
+///   2. Path matches section shape (`is_section_path`)
+///   3. No URL fragment (anchor links like `/news/world#bbc-main` rejected)
+///   4. Same-host as the page (or subdomain)
+///   5. Label is short (<=40 chars after cleaning) and <=5 words
+///   6. Label is not a truncation sentinel (`...` from `clean_link_label`)
+fn is_section_link(label: &str, href: &str, page_url: Option<&Url>) -> bool {
+    // Label-shape gate.
+    if label.is_empty() {
+        return false;
+    }
+    if label.contains("...") {
+        // Truncated long-article-title sentinel; not a section.
+        return false;
+    }
+    if label.chars().count() > 40 {
+        return false;
+    }
+    if label.split_whitespace().count() > 5 {
+        return false;
+    }
+
+    // URL-shape gate.
+    let url = match Url::parse(href) {
+        Ok(u) => u,
+        Err(_) => return false,
+    };
+    let scheme = url.scheme();
+    if scheme != "http" && scheme != "https" {
+        return false;
+    }
+    // Anchor / fragment exclusion — `/news/world#bbc-main` is not a section.
+    if url.fragment().is_some() {
+        return false;
+    }
+    // Query string is allowed but uncommon for section links; we don't
+    // reject on its presence — many sites carry a `?source=nav` tracker.
+    // The path itself must be section-shaped.
+    if !is_section_path(url.path()) {
+        return false;
+    }
+
+    // Same-host gate. If we don't know the page URL, fall through.
+    if let Some(page) = page_url
+        && let (Some(lh), Some(ph)) = (url.host_str(), page.host_str())
+        && !same_host(lh, ph)
+    {
+        return false;
+    }
+
+    true
+}
+
+// ---------------------------------------------------------------------------
+// Public surface — collectors and formatters
+// ---------------------------------------------------------------------------
+
+/// Collect a deduplicated (label, url) list of section links for the
+/// page. Reuses the noise-filtered link list `body::process_body`
+/// produces; applies the M8 section heuristic on top.
+///
+/// `page_url` is the canonical URL of the page (used for the same-host
+/// gate). When `None`, the same-host gate is skipped.
+pub fn collect_section_links(
+    result: &ExtractionResult,
+    page_url: Option<&str>,
+) -> Vec<(String, String)> {
+    let parsed_page = page_url.and_then(|u| Url::parse(u).ok());
+    let processed = body::process_body(&result.content.markdown);
+    let mut out: Vec<(String, String)> = Vec::new();
+    let mut seen_hrefs: std::collections::HashSet<String> = std::collections::HashSet::new();
+    for (text, href) in processed.links {
+        let label = links::clean_link_label(&text);
+        if !is_section_link(&label, &href, parsed_page.as_ref()) {
+            continue;
+        }
+        if !seen_hrefs.insert(href.clone()) {
+            continue;
+        }
+        out.push((label, href));
+    }
+    out
+}
+
+/// `-f llm` / `-f text` form: metadata header (Status line suppressed)
+/// followed by a `## Sections` block of `- [Label](url)` lines.
+///
+/// When the heuristic returns 0 sections, emits the header plus
+/// `## Sections\n_(no sections detected)_` so the caller can
+/// distinguish empty-result from a crash / parse failure.
+pub fn to_llm_sections(result: &ExtractionResult, url: Option<&str>) -> String {
+    let sections = collect_section_links(result, url);
+    let mut out = String::new();
+    // M7 suppression: section listing is conceptually navigation, not
+    // protocol-level outcome.
+    build_metadata_header_with_opts(&mut out, result, url, false);
+    if !out.is_empty() {
+        out.push('\n');
+    }
+    out.push_str("## Sections\n");
+    if sections.is_empty() {
+        out.push_str("_(no sections detected)_");
+    } else {
+        for (label, href) in &sections {
+            out.push_str(&format!("- [{label}]({href})\n"));
+        }
+    }
+    out.trim_end().to_string()
+}
+
+/// `-f json` form: `{"sections": [{"label": ..., "url": ...}, ...]}`.
+/// When 0 sections detected, `sections` is an empty array.
+pub fn to_json_sections(result: &ExtractionResult, url: Option<&str>) -> String {
+    let sections = collect_section_links(result, url);
+    let arr: Vec<serde_json::Value> = sections
+        .into_iter()
+        .map(|(label, href)| {
+            serde_json::json!({
+                "label": label,
+                "url": href,
+            })
+        })
+        .collect();
+    serde_json::to_string_pretty(&serde_json::json!({"sections": arr}))
+        .unwrap_or_else(|_| "{\"sections\": []}".to_string())
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{Content, ExtractionResult, Metadata};
+
+    fn make_result(markdown: &str) -> ExtractionResult {
+        ExtractionResult {
+            metadata: Metadata {
+                title: Some("Test Page".to_string()),
+                description: None,
+                author: None,
+                published_date: None,
+                language: None,
+                url: Some("https://example.com/".to_string()),
+                site_name: None,
+                image: None,
+                favicon: None,
+                word_count: 0,
+                http_status: None,
+            },
+            content: Content {
+                markdown: markdown.to_string(),
+                plain_text: String::new(),
+                links: Vec::new(),
+                images: Vec::new(),
+                code_blocks: Vec::new(),
+                raw_html: None,
+            },
+            domain_data: None,
+            structured_data: Vec::new(),
+        }
+    }
+
+    // -- heuristic primitives --
+
+    #[test]
+    fn test_section_filter_detects_url_pattern_sections() {
+        // 5 section-shaped URLs (BBC-style) + 15 article URLs.
+        let mut md = String::from("# Page\n\n");
+        // 5 section nav links.
+        md.push_str("- [Home](https://www.bbc.com/)\n");
+        md.push_str("- [Sport](https://www.bbc.com/sport)\n");
+        md.push_str("- [Health](https://www.bbc.com/health)\n");
+        md.push_str("- [Weather](https://www.bbc.com/weather)\n");
+        md.push_str("- [Newsletters](https://www.bbc.com/newsletters)\n");
+        // 15 article URLs (3-segment, article-ID shape).
+        for i in 0..15 {
+            md.push_str(&format!(
+                "- [Some long headline number {i}](https://www.bbc.com/news/articles/crmp121z3z{i:01x}o)\n"
+            ));
+        }
+        let r = make_result(&md);
+        let out = collect_section_links(&r, Some("https://www.bbc.com/news/world"));
+        assert_eq!(out.len(), 5, "expected 5 sections, got {}: {out:?}", out.len());
+        let labels: Vec<&str> = out.iter().map(|(l, _)| l.as_str()).collect();
+        assert!(labels.contains(&"Sport"), "missing Sport: {labels:?}");
+        assert!(labels.contains(&"Health"), "missing Health: {labels:?}");
+    }
+
+    #[test]
+    fn test_section_filter_repetition_signal() {
+        // After-dedup behavior: even when a section URL is referenced multiple
+        // times in the source markdown, the output contains exactly one entry
+        // per unique href. (Phase A: repetition is collapsed upstream by
+        // process_body; we verify the final list is deduped.)
+        let md = "# Page\n\n\
+                  - [Sport](https://www.bbc.com/sport)\n\
+                  - [Sport (top)](https://www.bbc.com/sport)\n\
+                  - [Sport (footer)](https://www.bbc.com/sport)\n\
+                  - [Unique](https://www.bbc.com/health)\n";
+        let r = make_result(md);
+        let out = collect_section_links(&r, Some("https://www.bbc.com/"));
+        assert_eq!(out.len(), 2, "expected 2 unique sections, got {out:?}");
+    }
+
+    #[test]
+    fn test_section_filter_combined_signals() {
+        // Mix sections, article slugs, denylisted paths, cross-host, anchor links.
+        let md = "# Decrypt-style\n\n\
+                  - [Business](https://decrypt.co/news/business)\n\
+                  - [Markets](https://decrypt.co/news/markets)\n\
+                  - [Editors' Picks](https://decrypt.co/news/editors-picks)\n\
+                  - [Bitcoin](https://decrypt.co/price/bitcoin)\n\
+                  - [Ethereum](https://decrypt.co/price/ethereum)\n\
+                  - [Search](https://decrypt.co/search)\n\
+                  - [Login](https://decrypt.co/login)\n\
+                  - [Cross-host](https://promo.elsewhere.com/sport)\n\
+                  - [Skip to content](https://decrypt.co/news/world#main)\n\
+                  - [Long article slug here that exceeds limit](https://decrypt.co/news/business/2024/05/some-article)\n";
+        let r = make_result(md);
+        let out = collect_section_links(&r, Some("https://decrypt.co/"));
+        // Only Business, Markets, Editors' Picks should pass.
+        assert_eq!(out.len(), 3, "expected 3 sections, got {out:?}");
+        let hrefs: Vec<&str> = out.iter().map(|(_, h)| h.as_str()).collect();
+        assert!(hrefs.contains(&"https://decrypt.co/news/business"));
+        assert!(hrefs.contains(&"https://decrypt.co/news/markets"));
+        assert!(hrefs.contains(&"https://decrypt.co/news/editors-picks"));
+        // Explicitly NOT present.
+        for bad in [
+            "https://decrypt.co/price/bitcoin",
+            "https://decrypt.co/search",
+            "https://decrypt.co/login",
+            "https://promo.elsewhere.com/sport",
+        ] {
+            assert!(!hrefs.contains(&bad), "{bad} should have been filtered out: {hrefs:?}");
+        }
+    }
+
+    #[test]
+    fn test_article_slug_excluded() {
+        // BBC article-id style segment.
+        let md = "- [Headline text](https://www.bbc.com/news/articles/crmp121z3z8o)\n";
+        let r = make_result(md);
+        let out = collect_section_links(&r, Some("https://www.bbc.com/news/world"));
+        assert!(out.is_empty(), "article-ID link should have been dropped: {out:?}");
+    }
+
+    #[test]
+    fn test_cross_host_link_dropped() {
+        let md = "- [Sport](https://promo.bbc.co.uk/sport)\n";
+        let r = make_result(md);
+        let out = collect_section_links(&r, Some("https://www.bbc.com/news/world"));
+        assert!(out.is_empty(), "cross-host link should have been dropped: {out:?}");
+    }
+
+    #[test]
+    fn test_subdomain_link_kept() {
+        // news.bbc.com is a subdomain of bbc.com — same_host should accept it.
+        let md = "- [Sport](https://news.bbc.com/sport)\n";
+        let r = make_result(md);
+        let out = collect_section_links(&r, Some("https://www.bbc.com/news/world"));
+        assert_eq!(out.len(), 1, "subdomain link should have passed: {out:?}");
+    }
+
+    #[test]
+    fn test_anchor_fragment_dropped() {
+        let md = "- [Skip to content](https://www.bbc.com/news/world#bbc-main)\n";
+        let r = make_result(md);
+        let out = collect_section_links(&r, Some("https://www.bbc.com/news/world"));
+        assert!(out.is_empty(), "fragment link should have been dropped: {out:?}");
+    }
+
+    #[test]
+    fn test_no_links_returns_empty() {
+        let md = "# Just a heading\n\nNo links at all here.";
+        let r = make_result(md);
+        let out = collect_section_links(&r, Some("https://example.com/"));
+        assert!(out.is_empty(), "expected empty: {out:?}");
+    }
+
+    // -- formatter tests --
+
+    #[test]
+    fn test_sections_mode_formats_llm_output() {
+        let md = "- [Sport](https://www.bbc.com/sport)\n- [Health](https://www.bbc.com/health)\n";
+        let mut r = make_result(md);
+        r.metadata.url = Some("https://www.bbc.com/news/world".to_string());
+        let out = to_llm_sections(&r, Some("https://www.bbc.com/news/world"));
+        assert!(out.contains("## Sections"), "missing Sections header: {out}");
+        assert!(out.contains("- [Sport](https://www.bbc.com/sport)"), "missing Sport: {out}");
+        assert!(out.contains("- [Health](https://www.bbc.com/health)"), "missing Health: {out}");
+        // Metadata header URL present, Status line absent (Sections mode passes include_status=false).
+        assert!(out.contains("> URL:"));
+        assert!(!out.contains("> Status:"));
+    }
+
+    #[test]
+    fn test_sections_mode_formats_json_output() {
+        let md = "- [Sport](https://www.bbc.com/sport)\n- [Health](https://www.bbc.com/health)\n";
+        let r = make_result(md);
+        let s = to_json_sections(&r, Some("https://www.bbc.com/news/world"));
+        let v: serde_json::Value = serde_json::from_str(&s).expect("valid JSON");
+        let arr = v["sections"].as_array().expect("sections array present");
+        assert_eq!(arr.len(), 2);
+        assert_eq!(arr[0]["label"].as_str().unwrap(), "Sport");
+        assert_eq!(arr[0]["url"].as_str().unwrap(), "https://www.bbc.com/sport");
+    }
+
+    #[test]
+    fn test_sections_mode_fallback_on_no_nav() {
+        // Phase A's chosen fallback: empty list with `_(no sections detected)_`
+        // marker in -f llm form, and `{"sections": []}` in -f json form.
+        let md = "# Page\n\nNo links here.";
+        let r = make_result(md);
+        let llm = to_llm_sections(&r, Some("https://example.com/"));
+        assert!(llm.contains("## Sections"), "missing header: {llm}");
+        assert!(llm.contains("(no sections detected)"), "missing fallback marker: {llm}");
+        let json = to_json_sections(&r, Some("https://example.com/"));
+        let v: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
+        assert_eq!(v["sections"].as_array().unwrap().len(), 0);
+    }
+
+    #[test]
+    fn test_status_header_suppressed_in_sections_mode() {
+        // Parallel to summary/toc behavior — Sections mode passes
+        // include_status=false to build_metadata_header_with_opts.
+        let mut r = make_result("- [Sport](https://www.bbc.com/sport)\n");
+        r.metadata.http_status = Some(404);
+        let out = to_llm_sections(&r, Some("https://www.bbc.com/news/world"));
+        assert!(
+            !out.contains("> Status:"),
+            "Status line leaked into sections mode output:\n{out}"
+        );
+    }
+
+    #[test]
+    fn test_no_page_url_skips_same_host_gate() {
+        // When page_url is None we don't know the host; the link still
+        // passes provided its URL shape is section-like.
+        let md = "- [Sport](https://www.bbc.com/sport)\n";
+        let r = make_result(md);
+        let out = collect_section_links(&r, None);
+        assert_eq!(out.len(), 1, "expected 1 section, got {out:?}");
+    }
+}