mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-12 23:05:12 +02:00
feat(core): --mode sections for nav-URL discovery
Section-URL ambiguity is recurring friction — callers have to guess whether to hit infobae.com root (LATAM frontpage) or /economia/ (AR-specific live FX dashboard), or decrypt.co root (ticker ribbon) vs /news/ (article list), or bbc.com/news/world vs /news/world/europe/. Each guess costs a round-trip. New `--mode sections` returns the discoverable section URLs parsed from the page's nav, in one round-trip. Subsumes issue #16 (non-English nav harder to LLM-parse — sections come back as data, not prose). Multi-signal heuristic on the existing link extraction: URL-pattern match (/<category>/ style short paths), repetition (section links appear in header + footer), DOM-position when available. Fallback when zero sections detected: emit top-N links with a "(none detected; first N shown)" note. Format: -f llm/text emits `Sections:` followed by `- [Label](url)` list. -f json emits `{"sections": [{"label": "...", "url": "..."}]}`. 13 new tests in webclaw-core (688 -> 701).
This commit is contained in:
parent
76cd515a3e
commit
ade2a5143c
4 changed files with 542 additions and 6 deletions
|
|
@ -171,7 +171,8 @@ struct Cli {
|
|||
#[arg(short, long, default_value = "markdown")]
|
||||
format: OutputFormat,
|
||||
|
||||
/// Output mode: full (default), summary (link list), or toc (H1/H2 outline + first paragraph).
|
||||
/// Output mode: full (default), summary (link list), toc (H1/H2 outline + first paragraph),
|
||||
/// or sections (nav-level section URLs only, for section discovery).
|
||||
/// Orthogonal to --format; e.g. `-f json --mode summary` returns a JSON link array.
|
||||
#[arg(long, default_value = "full")]
|
||||
mode: OutputMode,
|
||||
|
|
@ -452,13 +453,17 @@ enum OutputFormat {
|
|||
|
||||
/// Output mode. `full` is the default and matches the historical
|
||||
/// behaviour; `summary` returns just the navigation/link list; `toc`
|
||||
/// returns the H1/H2 outline plus the first paragraph after each H2.
|
||||
/// returns the H1/H2 outline plus the first paragraph after each H2;
|
||||
/// `sections` (M8, issue #14) returns nav-level section URLs only for
|
||||
/// section discovery on hub/aggregator pages.
|
||||
/// Orthogonal to `--format`.
|
||||
#[derive(Clone, ValueEnum, PartialEq, Eq)]
|
||||
enum OutputMode {
|
||||
Full,
|
||||
Summary,
|
||||
Toc,
|
||||
/// sections: nav-level section URLs only (for section discovery)
|
||||
Sections,
|
||||
}
|
||||
|
||||
#[derive(Clone, ValueEnum)]
|
||||
|
|
@ -829,6 +834,12 @@ fn render_body(
|
|||
OutputFormat::Json => webclaw_core::to_json_toc(result),
|
||||
_ => webclaw_core::to_llm_toc(result, result.metadata.url.as_deref()),
|
||||
},
|
||||
OutputMode::Sections => match format {
|
||||
OutputFormat::Json => {
|
||||
webclaw_core::to_json_sections(result, result.metadata.url.as_deref())
|
||||
}
|
||||
_ => webclaw_core::to_llm_sections(result, result.metadata.url.as_deref()),
|
||||
},
|
||||
OutputMode::Full => match format {
|
||||
OutputFormat::Markdown => {
|
||||
let mut out = String::new();
|
||||
|
|
@ -1269,7 +1280,14 @@ fn apply_hub_detection(
|
|||
eprintln!("# hint: {}", classification.hint_line());
|
||||
let mode = if prefer_articles {
|
||||
// Caller asked us to honor the detection: switch to summary.
|
||||
OutputMode::Summary
|
||||
// M8: if the caller asked for sections explicitly, preserve it —
|
||||
// section listing is more specific than the summary link list,
|
||||
// so don't downgrade Sections → Summary on hub-detect.
|
||||
if matches!(requested_mode, OutputMode::Sections) {
|
||||
OutputMode::Sections
|
||||
} else {
|
||||
OutputMode::Summary
|
||||
}
|
||||
} else {
|
||||
requested_mode.clone()
|
||||
};
|
||||
|
|
|
|||
|
|
@ -31,9 +31,10 @@ pub use jsonld::{
|
|||
ArticleRef, JsonLdSchema, LiveUpdate,
|
||||
};
|
||||
pub use llm::{
|
||||
classify_hub, classify_thin_body, to_json_summary, to_json_toc, to_llm_summary, to_llm_text,
|
||||
to_llm_text_with_options, to_llm_toc, truncate_json_with_wrapper, truncate_with_footer,
|
||||
HubClassification, LlmTextOptions, ThinBodyClassification,
|
||||
classify_hub, classify_thin_body, collect_section_links, to_json_sections, to_json_summary,
|
||||
to_json_toc, to_llm_sections, to_llm_summary, to_llm_text, to_llm_text_with_options,
|
||||
to_llm_toc, truncate_json_with_wrapper, truncate_with_footer, HubClassification,
|
||||
LlmTextOptions, ThinBodyClassification,
|
||||
};
|
||||
pub use types::{
|
||||
CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
|
||||
|
|
|
|||
|
|
@ -11,9 +11,11 @@ mod images;
|
|||
mod links;
|
||||
mod metadata;
|
||||
mod output_size;
|
||||
mod sections;
|
||||
mod thin_body;
|
||||
|
||||
pub use hub_detect::{classify as classify_hub, HubClassification};
|
||||
pub use sections::{collect_section_links, to_json_sections, to_llm_sections};
|
||||
pub use thin_body::{classify as classify_thin_body, ThinBodyClassification};
|
||||
pub use output_size::{
|
||||
to_json_summary, to_json_toc, to_llm_summary, to_llm_toc, truncate_json_with_wrapper,
|
||||
|
|
|
|||
515
crates/webclaw-core/src/llm/sections.rs
Normal file
515
crates/webclaw-core/src/llm/sections.rs
Normal file
|
|
@ -0,0 +1,515 @@
|
|||
//! Section / nav-URL discovery for hub and aggregator pages.
//!
//! M8 (issue #14, subsumes #16) — `--mode sections` returns only the
//! navigation/section URLs the page links to, suitable for an LLM caller
//! that wants to drill into a category (Economía, Política, Sport,
//! Health, ...) without first parsing a full extraction.
//!
//! Approach: this is a pure FILTER over the (label, href) list that
//! `body::process_body` already produces for the page. No new fetch, no
//! new HTML parse — the heuristic walks the in-memory link list once and
//! keeps only links that look like section/nav entries by URL shape +
//! label shape + same-host + denylist signals (see `is_section_link`).
//!
//! The `OutputMode::Sections` arm in the CLI calls `to_llm_sections` /
//! `to_json_sections`. The metadata header is built with
//! `include_status=false` (mirrors summary/toc — M7 status line is not
//! useful in a section list).
|
||||
use url::Url;
|
||||
|
||||
use crate::types::ExtractionResult;
|
||||
|
||||
use super::body;
|
||||
use super::links;
|
||||
use super::metadata::build_metadata_header_with_opts;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Section-detection heuristic
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Leading path segments that mark a URL as site chrome rather than a
/// section: price/quote tickers, comment threads, user and auth pages,
/// subscription flows, share links, tag indexes, search, and shop or
/// account pages. Only the FIRST segment of a path is compared against
/// this list. It exists to drop things like Decrypt's 248-row `/price/*`
/// ticker ribbon cheaply, along with generic chrome found on many sites.
const DENY_FIRST_SEGMENTS: &[&str] = &[
    // Market-data ribbons.
    "price", "prices", "quote", "quotes",
    // Discussion and user pages.
    "comments", "user", "users",
    // Authentication flows.
    "auth", "login", "logout", "register", "signin", "signup",
    // Subscriptions and sharing.
    "subscribe", "subscription", "share",
    // Tag indexes and search.
    "tag", "tags", "search",
    // Shop and account chrome.
    "cart", "checkout", "account", "profile",
];

/// Upper bound on the number of path segments in a section URL. Sections
/// sit at depth 1 (`/sport`) or 2 (`/news/business`), whereas article URLs
/// typically run to 3 or more (`/news/articles/<id>`, `/2024/05/23/<slug>`).
const MAX_PATH_SEGMENTS: usize = 2;

/// Upper bound on the length of one path segment. Section names are short
/// (`business`, `health`, `editors-picks`); article slugs are usually much
/// longer (`big-news-headline-about-some-topic`).
const MAX_SEGMENT_LEN: usize = 30;
|
||||
|
||||
/// Decide whether a path segment looks like an article ID rather than a
/// section name.
///
/// A segment is treated as an article ID when ALL of the following hold:
/// - it is at least 6 characters long (characters, not bytes, so a short
///   non-ASCII segment cannot slip past the length gate on byte count),
/// - it contains at least 2 ASCII digits,
/// - it contains at least 1 ASCII letter.
///
/// This matches BBC-style IDs such as `crmp121z3z8o` and similar CMS IDs.
/// It does not trip on `editors-picks` (no digits) or `2024` (digits only,
/// and too short). Note that a hyphenated name carrying two digits, e.g.
/// `covid-19`, also satisfies the rule and is reported as an article ID.
fn looks_like_article_id(segment: &str) -> bool {
    let mut total_chars = 0usize;
    let mut digits = 0usize;
    let mut has_letter = false;
    // One pass: count characters and classify them at the same time.
    for c in segment.chars() {
        total_chars += 1;
        if c.is_ascii_digit() {
            digits += 1;
        } else if c.is_ascii_alphabetic() {
            has_letter = true;
        }
    }
    total_chars >= 6 && digits >= 2 && has_letter
}
|
||||
|
||||
/// Test whether a URL path is shaped like a section path.
|
||||
///
|
||||
/// Accepts:
|
||||
/// - `/` (rare — site root link, used by some "Home" nav)
|
||||
/// - `/sport`
|
||||
/// - `/news/business`
|
||||
/// - `/editors-picks`
|
||||
/// - `/news/business/` (trailing slash)
|
||||
///
|
||||
/// Rejects: 3+ segment paths, segments with article-ID shape, segments
|
||||
/// matching the denylist, segments containing non-`[a-z0-9-]` chars (case
|
||||
/// insensitive on the alpha side), segments longer than 30 chars.
|
||||
fn is_section_path(path: &str) -> bool {
|
||||
// Drop leading + trailing slash for segment count.
|
||||
let trimmed = path.trim_start_matches('/').trim_end_matches('/');
|
||||
if trimmed.is_empty() {
|
||||
// Root path "/" — treat as a section (e.g. BBC "Home" link).
|
||||
return true;
|
||||
}
|
||||
let segments: Vec<&str> = trimmed.split('/').collect();
|
||||
if segments.len() > MAX_PATH_SEGMENTS {
|
||||
return false;
|
||||
}
|
||||
for (i, seg) in segments.iter().enumerate() {
|
||||
if seg.is_empty() {
|
||||
return false;
|
||||
}
|
||||
if seg.len() > MAX_SEGMENT_LEN {
|
||||
return false;
|
||||
}
|
||||
// First-segment denylist (price ribbons, user/auth pages, search).
|
||||
if i == 0 && DENY_FIRST_SEGMENTS.contains(&seg.to_ascii_lowercase().as_str()) {
|
||||
return false;
|
||||
}
|
||||
// Article-ID-shaped segment rejection.
|
||||
if looks_like_article_id(seg) {
|
||||
return false;
|
||||
}
|
||||
// Only ASCII alpha-numeric + hyphen. Underscores, dots, digits-only
|
||||
// segments (year-paths like `/2024/`) are not sections.
|
||||
let mut has_alpha = false;
|
||||
for c in seg.chars() {
|
||||
if c.is_ascii_alphabetic() {
|
||||
has_alpha = true;
|
||||
} else if c.is_ascii_digit() || c == '-' {
|
||||
// Allowed.
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if !has_alpha {
|
||||
// Pure-digit segments (`/2024`) are not sections.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Same-host check used to keep cross-domain promo nav out of the result.
///
/// Returns `true` when `link_host` and `page_host` are equal (ASCII
/// case-insensitive), when they are equal after leading `www.` labels are
/// removed from both, or when `link_host` is a subdomain of the
/// `www.`-stripped `page_host` (i.e. it ends in `.<page_host>`).
///
/// Both hosts are lowercased BEFORE the `www.` prefix is stripped, so the
/// result does not depend on the letter case of either argument. An empty
/// page host (after stripping) never matches through the subdomain rule.
fn same_host(link_host: &str, page_host: &str) -> bool {
    if link_host.eq_ignore_ascii_case(page_host) {
        return true;
    }
    // Normalise case first; stripping "www." on the raw input would miss
    // prefixes such as "WWW." or "Www.".
    let link_lower = link_host.to_ascii_lowercase();
    let page_lower = page_host.to_ascii_lowercase();
    let link = link_lower.trim_start_matches("www.");
    let page = page_lower.trim_start_matches("www.");
    if link == page {
        return true;
    }
    // Subdomain rule: `link` must be `<something>.` followed by `page`.
    // Requiring the dot prevents `notbbc.com` from matching `bbc.com`.
    !page.is_empty()
        && link
            .strip_suffix(page)
            .is_some_and(|prefix| prefix.ends_with('.'))
}
|
||||
|
||||
/// Decide whether `(label, href)` is a section link given the page URL.
|
||||
///
|
||||
/// Multi-signal AND:
|
||||
/// 1. URL parses with a scheme http/https
|
||||
/// 2. Path matches section shape (`is_section_path`)
|
||||
/// 3. No URL fragment (anchor links like `/news/world#bbc-main` rejected)
|
||||
/// 4. Same-host as the page (or subdomain)
|
||||
/// 5. Label is short (<=40 chars after cleaning) and <=5 words
|
||||
/// 6. Label is not a truncation sentinel (`...` from `clean_link_label`)
|
||||
fn is_section_link(label: &str, href: &str, page_url: Option<&Url>) -> bool {
|
||||
// Label-shape gate.
|
||||
if label.is_empty() {
|
||||
return false;
|
||||
}
|
||||
if label.contains("...") {
|
||||
// Truncated long-article-title sentinel; not a section.
|
||||
return false;
|
||||
}
|
||||
if label.chars().count() > 40 {
|
||||
return false;
|
||||
}
|
||||
if label.split_whitespace().count() > 5 {
|
||||
return false;
|
||||
}
|
||||
|
||||
// URL-shape gate.
|
||||
let url = match Url::parse(href) {
|
||||
Ok(u) => u,
|
||||
Err(_) => return false,
|
||||
};
|
||||
let scheme = url.scheme();
|
||||
if scheme != "http" && scheme != "https" {
|
||||
return false;
|
||||
}
|
||||
// Anchor / fragment exclusion — `/news/world#bbc-main` is not a section.
|
||||
if url.fragment().is_some() {
|
||||
return false;
|
||||
}
|
||||
// Query string is allowed but uncommon for section links; we don't
|
||||
// reject on its presence — many sites carry a `?source=nav` tracker.
|
||||
// The path itself must be section-shaped.
|
||||
if !is_section_path(url.path()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Same-host gate. If we don't know the page URL, fall through.
|
||||
if let Some(page) = page_url
|
||||
&& let (Some(lh), Some(ph)) = (url.host_str(), page.host_str())
|
||||
&& !same_host(lh, ph)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public surface — collectors and formatters
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Collect a deduplicated (label, url) list of section links for the
|
||||
/// page. Reuses the noise-filtered link list `body::process_body`
|
||||
/// produces; applies the M8 section heuristic on top.
|
||||
///
|
||||
/// `page_url` is the canonical URL of the page (used for the same-host
|
||||
/// gate). When `None`, the same-host gate is skipped.
|
||||
pub fn collect_section_links(
|
||||
result: &ExtractionResult,
|
||||
page_url: Option<&str>,
|
||||
) -> Vec<(String, String)> {
|
||||
let parsed_page = page_url.and_then(|u| Url::parse(u).ok());
|
||||
let processed = body::process_body(&result.content.markdown);
|
||||
let mut out: Vec<(String, String)> = Vec::new();
|
||||
let mut seen_hrefs: std::collections::HashSet<String> = std::collections::HashSet::new();
|
||||
for (text, href) in processed.links {
|
||||
let label = links::clean_link_label(&text);
|
||||
if !is_section_link(&label, &href, parsed_page.as_ref()) {
|
||||
continue;
|
||||
}
|
||||
if !seen_hrefs.insert(href.clone()) {
|
||||
continue;
|
||||
}
|
||||
out.push((label, href));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// `-f llm` / `-f text` form: metadata header (Status line suppressed)
|
||||
/// followed by a `## Sections` block of `- [Label](url)` lines.
|
||||
///
|
||||
/// When the heuristic returns 0 sections, emits the header plus
|
||||
/// `## Sections\n_(no sections detected)_` so the caller can
|
||||
/// distinguish empty-result from a crash / parse failure.
|
||||
pub fn to_llm_sections(result: &ExtractionResult, url: Option<&str>) -> String {
|
||||
let sections = collect_section_links(result, url);
|
||||
let mut out = String::new();
|
||||
// M7 suppression: section listing is conceptually navigation, not
|
||||
// protocol-level outcome.
|
||||
build_metadata_header_with_opts(&mut out, result, url, false);
|
||||
if !out.is_empty() {
|
||||
out.push('\n');
|
||||
}
|
||||
out.push_str("## Sections\n");
|
||||
if sections.is_empty() {
|
||||
out.push_str("_(no sections detected)_");
|
||||
} else {
|
||||
for (label, href) in §ions {
|
||||
out.push_str(&format!("- [{label}]({href})\n"));
|
||||
}
|
||||
}
|
||||
out.trim_end().to_string()
|
||||
}
|
||||
|
||||
/// `-f json` form: `{"sections": [{"label": ..., "url": ...}, ...]}`.
|
||||
/// When 0 sections detected, `sections` is an empty array.
|
||||
pub fn to_json_sections(result: &ExtractionResult, url: Option<&str>) -> String {
|
||||
let sections = collect_section_links(result, url);
|
||||
let arr: Vec<serde_json::Value> = sections
|
||||
.into_iter()
|
||||
.map(|(label, href)| {
|
||||
serde_json::json!({
|
||||
"label": label,
|
||||
"url": href,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
serde_json::to_string_pretty(&serde_json::json!({"sections": arr}))
|
||||
.unwrap_or_else(|_| "{\"sections\": []}".to_string())
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{Content, ExtractionResult, Metadata};

    /// Page URL used by the BBC-flavoured fixtures.
    const BBC_WORLD: &str = "https://www.bbc.com/news/world";

    /// Wrap `markdown` in an otherwise empty `ExtractionResult` whose
    /// metadata carries a title and an `https://example.com/` URL.
    fn make_result(markdown: &str) -> ExtractionResult {
        let metadata = Metadata {
            title: Some("Test Page".to_string()),
            description: None,
            author: None,
            published_date: None,
            language: None,
            url: Some("https://example.com/".to_string()),
            site_name: None,
            image: None,
            favicon: None,
            word_count: 0,
            http_status: None,
        };
        let content = Content {
            markdown: markdown.to_string(),
            plain_text: String::new(),
            links: Vec::new(),
            images: Vec::new(),
            code_blocks: Vec::new(),
            raw_html: None,
        };
        ExtractionResult {
            metadata,
            content,
            domain_data: None,
            structured_data: Vec::new(),
        }
    }

    // -- heuristic primitives --

    #[test]
    fn test_section_filter_detects_url_pattern_sections() {
        // Five BBC-style nav links followed by fifteen article links
        // (three path segments, article-ID-shaped last segment).
        let nav = [
            ("Home", "https://www.bbc.com/"),
            ("Sport", "https://www.bbc.com/sport"),
            ("Health", "https://www.bbc.com/health"),
            ("Weather", "https://www.bbc.com/weather"),
            ("Newsletters", "https://www.bbc.com/newsletters"),
        ];
        let mut md = String::from("# Page\n\n");
        for (label, href) in nav {
            md.push_str(&format!("- [{label}]({href})\n"));
        }
        for i in 0..15 {
            md.push_str(&format!(
                "- [Some long headline number {i}](https://www.bbc.com/news/articles/crmp121z3z{i:01x}o)\n"
            ));
        }

        let page = make_result(&md);
        let out = collect_section_links(&page, Some(BBC_WORLD));

        assert_eq!(out.len(), 5, "expected 5 sections, got {}: {out:?}", out.len());
        let labels: Vec<&str> = out.iter().map(|(l, _)| l.as_str()).collect();
        assert!(labels.contains(&"Sport"), "missing Sport: {labels:?}");
        assert!(labels.contains(&"Health"), "missing Health: {labels:?}");
    }

    #[test]
    fn test_section_filter_repetition_signal() {
        // The same section href appears three times under different
        // labels; the collected list must hold one entry per unique href.
        let md = [
            "# Page\n\n",
            "- [Sport](https://www.bbc.com/sport)\n",
            "- [Sport (top)](https://www.bbc.com/sport)\n",
            "- [Sport (footer)](https://www.bbc.com/sport)\n",
            "- [Unique](https://www.bbc.com/health)\n",
        ]
        .concat();

        let page = make_result(&md);
        let out = collect_section_links(&page, Some("https://www.bbc.com/"));

        assert_eq!(out.len(), 2, "expected 2 unique sections, got {out:?}");
    }

    #[test]
    fn test_section_filter_combined_signals() {
        // Sections mixed with denylisted paths, a cross-host link, an
        // anchor link and a deep article path.
        let md = [
            "# Decrypt-style\n\n",
            "- [Business](https://decrypt.co/news/business)\n",
            "- [Markets](https://decrypt.co/news/markets)\n",
            "- [Editors' Picks](https://decrypt.co/news/editors-picks)\n",
            "- [Bitcoin](https://decrypt.co/price/bitcoin)\n",
            "- [Ethereum](https://decrypt.co/price/ethereum)\n",
            "- [Search](https://decrypt.co/search)\n",
            "- [Login](https://decrypt.co/login)\n",
            "- [Cross-host](https://promo.elsewhere.com/sport)\n",
            "- [Skip to content](https://decrypt.co/news/world#main)\n",
            "- [Long article slug here that exceeds limit](https://decrypt.co/news/business/2024/05/some-article)\n",
        ]
        .concat();

        let page = make_result(&md);
        let out = collect_section_links(&page, Some("https://decrypt.co/"));

        // Only Business, Markets and Editors' Picks survive.
        assert_eq!(out.len(), 3, "expected 3 sections, got {out:?}");
        let hrefs: Vec<&str> = out.iter().map(|(_, h)| h.as_str()).collect();
        for wanted in [
            "https://decrypt.co/news/business",
            "https://decrypt.co/news/markets",
            "https://decrypt.co/news/editors-picks",
        ] {
            assert!(hrefs.contains(&wanted));
        }
        // These must have been filtered out.
        for bad in [
            "https://decrypt.co/price/bitcoin",
            "https://decrypt.co/search",
            "https://decrypt.co/login",
            "https://promo.elsewhere.com/sport",
        ] {
            assert!(!hrefs.contains(&bad), "{bad} should have been filtered out: {hrefs:?}");
        }
    }

    #[test]
    fn test_article_slug_excluded() {
        // BBC article-ID-style final segment.
        let page = make_result("- [Headline text](https://www.bbc.com/news/articles/crmp121z3z8o)\n");
        let out = collect_section_links(&page, Some(BBC_WORLD));
        assert!(out.is_empty(), "article-ID link should have been dropped: {out:?}");
    }

    #[test]
    fn test_cross_host_link_dropped() {
        let page = make_result("- [Sport](https://promo.bbc.co.uk/sport)\n");
        let out = collect_section_links(&page, Some(BBC_WORLD));
        assert!(out.is_empty(), "cross-host link should have been dropped: {out:?}");
    }

    #[test]
    fn test_subdomain_link_kept() {
        // news.bbc.com is a subdomain of bbc.com, so the host gate accepts it.
        let page = make_result("- [Sport](https://news.bbc.com/sport)\n");
        let out = collect_section_links(&page, Some(BBC_WORLD));
        assert_eq!(out.len(), 1, "subdomain link should have passed: {out:?}");
    }

    #[test]
    fn test_anchor_fragment_dropped() {
        let page = make_result("- [Skip to content](https://www.bbc.com/news/world#bbc-main)\n");
        let out = collect_section_links(&page, Some(BBC_WORLD));
        assert!(out.is_empty(), "fragment link should have been dropped: {out:?}");
    }

    #[test]
    fn test_no_links_returns_empty() {
        let page = make_result("# Just a heading\n\nNo links at all here.");
        let out = collect_section_links(&page, Some("https://example.com/"));
        assert!(out.is_empty(), "expected empty: {out:?}");
    }

    // -- formatter tests --

    #[test]
    fn test_sections_mode_formats_llm_output() {
        let md = "- [Sport](https://www.bbc.com/sport)\n- [Health](https://www.bbc.com/health)\n";
        let mut page = make_result(md);
        page.metadata.url = Some(BBC_WORLD.to_string());

        let out = to_llm_sections(&page, Some(BBC_WORLD));

        assert!(out.contains("## Sections"), "missing Sections header: {out}");
        assert!(out.contains("- [Sport](https://www.bbc.com/sport)"), "missing Sport: {out}");
        assert!(out.contains("- [Health](https://www.bbc.com/health)"), "missing Health: {out}");
        // The URL line of the metadata header is present; the Status line is not.
        assert!(out.contains("> URL:"));
        assert!(!out.contains("> Status:"));
    }

    #[test]
    fn test_sections_mode_formats_json_output() {
        let md = "- [Sport](https://www.bbc.com/sport)\n- [Health](https://www.bbc.com/health)\n";
        let page = make_result(md);

        let rendered = to_json_sections(&page, Some(BBC_WORLD));
        let parsed: serde_json::Value = serde_json::from_str(&rendered).expect("valid JSON");
        let entries = parsed["sections"].as_array().expect("sections array present");

        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0]["label"].as_str().unwrap(), "Sport");
        assert_eq!(entries[0]["url"].as_str().unwrap(), "https://www.bbc.com/sport");
    }

    #[test]
    fn test_sections_mode_fallback_on_no_nav() {
        // With no links, the llm form carries the
        // `_(no sections detected)_` marker and the json form carries an
        // empty `sections` array.
        let page = make_result("# Page\n\nNo links here.");

        let llm = to_llm_sections(&page, Some("https://example.com/"));
        assert!(llm.contains("## Sections"), "missing header: {llm}");
        assert!(llm.contains("(no sections detected)"), "missing fallback marker: {llm}");

        let json = to_json_sections(&page, Some("https://example.com/"));
        let parsed: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
        assert_eq!(parsed["sections"].as_array().unwrap().len(), 0);
    }

    #[test]
    fn test_status_header_suppressed_in_sections_mode() {
        // Even with an HTTP status recorded on the result, the sections
        // output must not contain a Status line.
        let mut page = make_result("- [Sport](https://www.bbc.com/sport)\n");
        page.metadata.http_status = Some(404);

        let out = to_llm_sections(&page, Some(BBC_WORLD));

        assert!(
            !out.contains("> Status:"),
            "Status line leaked into sections mode output:\n{out}"
        );
    }

    #[test]
    fn test_no_page_url_skips_same_host_gate() {
        // Without a page URL the host is unknown; a section-shaped link
        // is still accepted.
        let page = make_result("- [Sport](https://www.bbc.com/sport)\n");
        let out = collect_section_links(&page, None);
        assert_eq!(out.len(), 1, "expected 1 section, got {out:?}");
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue