feat(cli): add --max-output-bytes and --mode summary,toc for output-size control

Two additive CLI flags (--max-output-bytes, plus --mode with two new values) addressing the 50KB persisted-output cap that trips Claude Code's per-tool-result harness on aggregator front pages (apnews.com, cnbc.com/markets/, b92.net all >50KB by default): --max-output-bytes N: truncates final output at N bytes with a clear '[truncated: M more bytes ...]' footer. N=0 means unlimited (default). UTF-8 codepoint-boundary safe; JSON output is wrapped instead so truncated output stays parseable. --mode summary: returns only the extracted link list (titles + URLs), no body text. For aggregator front pages where the LLM is going to drill into the individual articles next anyway. --mode toc: returns H1/H2 outline + first paragraph after each H2. For long single-article pages. New flags are orthogonal to -f (json/llm/text). 9 new unit tests in webclaw-core, total goes 308 -> 317 passing. Smoke-tested on apnews.com (51713 -> 27404 summary -> 6269 toc -> 8193 capped), pitchfork.com (42049 -> 379 summary), cnbc.com (56682 -> 16385 capped).
2026-06-11 22:55:13 +02:00 · 2026-05-23 18:17:42 +02:00 · 2026-05-23 18:17:42 +02:00 · 339f41bb7c
commit 339f41bb7c
parent 562c6a15f0
4 changed files with 756 additions and 54 deletions
--- a/crates/webclaw-cli/src/main.rs
+++ b/crates/webclaw-cli/src/main.rs
@ -170,6 +170,16 @@ struct Cli {
    #[arg(short, long, default_value = "markdown")]
    format: OutputFormat,

+    /// Output mode: full (default), summary (link list), or toc (H1/H2 outline + first paragraph).
+    /// Orthogonal to --format; e.g. `-f json --mode summary` returns a JSON link array.
+    #[arg(long, default_value = "full")]
+    mode: OutputMode,
+
+    /// Cap the final output at N bytes; on overflow truncate at a UTF-8 boundary
+    /// and append a [truncated: N more bytes ...] footer. 0 = no cap (default).
+    #[arg(long, default_value = "0")]
+    max_output_bytes: u64,
+
    /// Browser to impersonate
    #[arg(short, long, default_value = "chrome")]
    browser: Browser,
@ -413,6 +423,17 @@ enum OutputFormat {
    Html,
 }

+/// Output mode. `full` is the default and matches the historical
+/// behaviour; `summary` returns just the navigation/link list; `toc`
+/// returns the H1/H2 outline plus the first paragraph after each H2.
+/// Orthogonal to `--format`.
+#[derive(Clone, ValueEnum, PartialEq, Eq)]
+enum OutputMode {
+    // Complete rendering for the selected `--format` (the `--mode` default).
+    Full,
+    // Link list only: rendered via `to_json_summary` for JSON, `to_llm_summary` otherwise.
+    Summary,
+    // Heading outline: rendered via `to_json_toc` for JSON, `to_llm_toc` otherwise.
+    Toc,
+}
+
 #[derive(Clone, ValueEnum)]
 enum Browser {
    Chrome,
@ -719,26 +740,80 @@ fn raw_html_or_markdown(result: &ExtractionResult) -> &str {

 /// Format an `ExtractionResult` into a string for the given output format.
 fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
+    format_output_with_mode(result, format, show_metadata, &OutputMode::Full, 0)
+}
+
+/// Render `result` for the requested `format` / `mode` pair and then
+/// enforce the byte cap, yielding the string that goes to stdout or disk.
+///
+/// - `OutputMode::Full` produces the same text as the legacy formatter.
+/// - `OutputMode::Summary` produces the link list (a JSON array of
+///   `{title, url}` objects when `format` is JSON).
+/// - `OutputMode::Toc` produces the H1/H2 outline with the first paragraph
+///   after each H2.
+///
+/// A `max_output_bytes` of `0` means "no cap". Any other value truncates
+/// the rendered text at a UTF-8 boundary and appends a `[truncated: ...]`
+/// footer; JSON output is replaced by a `_truncated` wrapper object so it
+/// remains parseable.
+fn format_output_with_mode(
+    result: &ExtractionResult,
+    format: &OutputFormat,
+    show_metadata: bool,
+    mode: &OutputMode,
+    max_output_bytes: u64,
+) -> String {
+    apply_byte_cap(
+        &render_body(result, format, show_metadata, mode),
+        format,
+        max_output_bytes,
+    )
+}
+
+/// Produce the uncapped output text for one `(mode, format)` combination.
+///
+/// Summary and TOC modes only distinguish JSON from "everything else";
+/// full mode renders each format individually. `show_metadata` only
+/// affects full-mode Markdown, where it prepends the front matter.
+fn render_body(
+    result: &ExtractionResult,
+    format: &OutputFormat,
+    show_metadata: bool,
+    mode: &OutputMode,
+) -> String {
+    let source_url = result.metadata.url.as_deref();
+    match (mode, format) {
+        (OutputMode::Summary, OutputFormat::Json) => webclaw_core::to_json_summary(result),
+        (OutputMode::Summary, _) => webclaw_core::to_llm_summary(result, source_url),
+        (OutputMode::Toc, OutputFormat::Json) => webclaw_core::to_json_toc(result),
+        (OutputMode::Toc, _) => webclaw_core::to_llm_toc(result, source_url),
+        (OutputMode::Full, OutputFormat::Markdown) => {
+            let mut md = if show_metadata {
+                format_frontmatter(&result.metadata)
+            } else {
+                String::new()
+            };
+            md.push_str(&result.content.markdown);
+            if !result.structured_data.is_empty() {
+                let structured =
+                    serde_json::to_string_pretty(&result.structured_data).unwrap_or_default();
+                md.push_str("\n\n## Structured Data\n\n```json\n");
+                md.push_str(&structured);
+                md.push_str("\n```");
+            }
+            md
+        }
+        (OutputMode::Full, OutputFormat::Json) => {
+            serde_json::to_string_pretty(result).expect("serialization failed")
+        }
+        (OutputMode::Full, OutputFormat::Text) => result.content.plain_text.clone(),
+        (OutputMode::Full, OutputFormat::Llm) => to_llm_text(result, source_url),
+        (OutputMode::Full, OutputFormat::Html) => raw_html_or_markdown(result).to_string(),
+    }
+}
+
+/// Enforce `--max-output-bytes` on already-rendered output.
+///
+/// `cap == 0` returns `body` unchanged. Otherwise JSON output goes through
+/// `truncate_json_with_wrapper` (keeps the document parseable) and every
+/// other format through `truncate_with_footer`.
+fn apply_byte_cap(body: &str, format: &OutputFormat, cap: u64) -> String {
+    if cap == 0 {
+        return body.to_string();
+    }
+    // Saturate instead of `as usize`: on 32-bit targets a plain cast would
+    // silently wrap a huge cap into a tiny (or zero) one.
+    let cap = usize::try_from(cap).unwrap_or(usize::MAX);
    match format {
-        OutputFormat::Markdown => {
-            let mut out = String::new();
-            if show_metadata {
-                out.push_str(&format_frontmatter(&result.metadata));
-            }
-            out.push_str(&result.content.markdown);
-            if !result.structured_data.is_empty() {
-                out.push_str("\n\n## Structured Data\n\n```json\n");
-                out.push_str(
-                    &serde_json::to_string_pretty(&result.structured_data).unwrap_or_default(),
-                );
-                out.push_str("\n```");
-            }
-            out
-        }
-        OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
-        OutputFormat::Text => result.content.plain_text.clone(),
-        OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
-        OutputFormat::Html => raw_html_or_markdown(result).to_string(),
+        OutputFormat::Json => webclaw_core::truncate_json_with_wrapper(body, cap),
+        _ => webclaw_core::truncate_with_footer(body, cap),
    }
 }

@ -1036,37 +1111,15 @@ fn format_frontmatter(meta: &Metadata) -> String {
    lines.join("\n")
 }

-fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) {
-    match format {
-        OutputFormat::Markdown => {
-            if show_metadata {
-                print!("{}", format_frontmatter(&result.metadata));
-            }
-            println!("{}", result.content.markdown);
-            if !result.structured_data.is_empty() {
-                println!(
-                    "\n## Structured Data\n\n```json\n{}\n```",
-                    serde_json::to_string_pretty(&result.structured_data).unwrap_or_default()
-                );
-            }
-        }
-        OutputFormat::Json => {
-            // serde_json::to_string_pretty won't fail on our types
-            println!(
-                "{}",
-                serde_json::to_string_pretty(result).expect("serialization failed")
-            );
-        }
-        OutputFormat::Text => {
-            println!("{}", result.content.plain_text);
-        }
-        OutputFormat::Llm => {
-            println!("{}", to_llm_text(result, result.metadata.url.as_deref()));
-        }
-        OutputFormat::Html => {
-            println!("{}", raw_html_or_markdown(result));
-        }
-    }
+/// Render `result` via `format_output_with_mode` and write it to stdout.
+///
+/// With no cap (`max_output_bytes == 0`) the text is always followed by a
+/// newline, as before. With a cap, the trailing newline is only added when
+/// the text does not already end in one and there is still room for it, so
+/// the bytes written to stdout never exceed what the capped string allows
+/// (a bare `println!` would push a truncated result to cap + 1 bytes).
+fn print_output_with_mode(
+    result: &ExtractionResult,
+    format: &OutputFormat,
+    show_metadata: bool,
+    mode: &OutputMode,
+    max_output_bytes: u64,
+) {
+    let out = format_output_with_mode(result, format, show_metadata, mode, max_output_bytes);
+    let room_for_newline = max_output_bytes == 0
+        || (!out.ends_with('\n') && (out.len() as u64) < max_output_bytes);
+    if room_for_newline {
+        println!("{out}");
+    } else {
+        // The truncation footer already ends with '\n'; adding another
+        // would overshoot the cap by one byte.
+        print!("{out}");
+    }
 }

 /// Print cloud API response in the requested format.
@ -1132,6 +1185,53 @@ fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
    }
 }

+/// Print the cloud response in `format`, honouring `--max-output-bytes`.
+///
+/// `cap == 0` delegates to `print_cloud_output` unchanged. Otherwise the
+/// response is rendered to a string, capped with `apply_byte_cap`, and
+/// written to stdout. A trailing newline is added only when the capped
+/// text does not already end in one and still has room for it, so stdout
+/// never receives more than `cap` bytes for a truncated result.
+fn print_cloud_output_capped(resp: &serde_json::Value, format: &OutputFormat, cap: u64) {
+    if cap == 0 {
+        print_cloud_output(resp, format);
+        return;
+    }
+    let out = apply_byte_cap(&render_cloud_body(resp, format), format, cap);
+    if !out.ends_with('\n') && (out.len() as u64) < cap {
+        println!("{out}");
+    } else {
+        // Footer-terminated (or exactly cap-sized) output: an extra
+        // newline from `println!` would exceed the cap by one byte.
+        print!("{out}");
+    }
+}
+
+/// Render a cloud API response to a string for `format`.
+///
+/// JSON pretty-prints the whole response. Markdown reads
+/// `content.markdown`, then top-level `markdown`, and finally falls back to
+/// the pretty-printed response. Text, LLM and HTML read `content.plain_text`,
+/// `content.llm_text` and `content.raw_html` respectively, falling back to
+/// the Markdown rendering when the field is missing or not a string.
+fn render_cloud_body(resp: &serde_json::Value, format: &OutputFormat) -> String {
+    // Owned copy of `resp["content"][key]` when it exists and is a string.
+    let content_str = |key: &str| -> Option<String> {
+        resp.get("content")?.get(key)?.as_str().map(str::to_owned)
+    };
+    let whole_response = || serde_json::to_string_pretty(resp).expect("serialization failed");
+    let as_markdown = || render_cloud_body(resp, &OutputFormat::Markdown);
+
+    match format {
+        OutputFormat::Json => whole_response(),
+        OutputFormat::Markdown => content_str("markdown")
+            .or_else(|| {
+                resp.get("markdown")
+                    .and_then(|m| m.as_str())
+                    .map(str::to_owned)
+            })
+            .unwrap_or_else(whole_response),
+        OutputFormat::Text => content_str("plain_text").unwrap_or_else(as_markdown),
+        OutputFormat::Llm => content_str("llm_text").unwrap_or_else(as_markdown),
+        OutputFormat::Html => content_str("raw_html").unwrap_or_else(as_markdown),
+    }
+}
+
 fn print_diff_output(diff: &ContentDiff, format: &OutputFormat) {
    match format {
        OutputFormat::Json => {
@ -2662,17 +2762,33 @@ async fn main() {
                    .unwrap_or_default();
                let custom_name = entries.first().and_then(|(_, name)| name.clone());
                let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format));
-                let content = format_output(&result, &cli.format, cli.metadata);
+                let content = format_output_with_mode(
+                    &result,
+                    &cli.format,
+                    cli.metadata,
+                    &cli.mode,
+                    cli.max_output_bytes,
+                );
                if let Err(e) = write_to_file(dir, &filename, &content) {
                    eprintln!("error: {e}");
                    process::exit(1);
                }
            } else {
-                print_output(&result, &cli.format, cli.metadata);
+                print_output_with_mode(
+                    &result,
+                    &cli.format,
+                    cli.metadata,
+                    &cli.mode,
+                    cli.max_output_bytes,
+                );
            }
        }
        Ok(FetchOutput::Cloud(resp)) => {
-            print_cloud_output(&resp, &cli.format);
+            // Cloud path does not yet have a structured ExtractionResult,
+            // so --mode summary/toc can't be applied here. We still apply
+            // the byte cap to the rendered cloud output by routing through
+            // a helper that prints to a buffer first.
+            print_cloud_output_capped(&resp, &cli.format, cli.max_output_bytes);
        }
        Err(e) => {
            eprintln!("{e}");
--- a/crates/webclaw-core/src/lib.rs
+++ b/crates/webclaw-core/src/lib.rs
@ -25,7 +25,10 @@ pub use brand::BrandIdentity;
 pub use diff::{ChangeStatus, ContentDiff, MetadataChange};
 pub use domain::DomainType;
 pub use error::ExtractError;
-pub use llm::to_llm_text;
+pub use llm::{
+    to_json_summary, to_json_toc, to_llm_summary, to_llm_text, to_llm_toc,
+    truncate_json_with_wrapper, truncate_with_footer,
+};
 pub use types::{
    CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
 };
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@ -9,6 +9,12 @@ mod cleanup;
 mod images;
 mod links;
 mod metadata;
+mod output_size;
+
+pub use output_size::{
+    to_json_summary, to_json_toc, to_llm_summary, to_llm_toc, truncate_json_with_wrapper,
+    truncate_with_footer,
+};

 use crate::types::ExtractionResult;

--- a/crates/webclaw-core/src/llm/output_size.rs
+++ b/crates/webclaw-core/src/llm/output_size.rs
@ -0,0 +1,577 @@
+//! Output-size control: alternate output modes (summary, toc) plus
+//! post-format byte-cap truncation with a clear footer.
+//!
+//! Three orthogonal axes:
+//!   - `OutputMode` (full | summary | toc) selects what to emit
+//!   - `OutputFormat` (text/markdown vs json) is owned by the caller
+//!   - `max_output_bytes` caps the FINAL byte count after format emission
+//!
+//! `summary` returns a navigation/link list extracted from the page.
+//! `toc` returns the H1/H2 outline plus the first paragraph after each H2.
+//! `truncate_with_footer` cuts only at UTF-8 codepoint boundaries, so it
+//! never produces an invalid UTF-8 split.
+use crate::types::ExtractionResult;
+
+use super::body;
+use super::links;
+use super::metadata::build_metadata_header;
+
+// ---------------------------------------------------------------------------
+// Summary mode — link/title list, no body
+// ---------------------------------------------------------------------------
+
+/// Render the summary view: the metadata header (so the reader can tell
+/// which page this came from) followed by a `## Links` section holding one
+/// `- [Title](URL)` bullet per link returned by `collect_summary_links`.
+/// Trailing whitespace is trimmed from the result.
+pub fn to_llm_summary(result: &ExtractionResult, url: Option<&str>) -> String {
+    let entries = collect_summary_links(result);
+
+    let mut rendered = String::new();
+    build_metadata_header(&mut rendered, result, url);
+    if !rendered.is_empty() {
+        // Blank line between the header and the link section.
+        rendered.push('\n');
+    }
+    rendered.push_str("## Links\n");
+    rendered.extend(
+        entries
+            .iter()
+            .map(|(label, href)| format!("- [{label}]({href})\n")),
+    );
+    rendered.trim_end().to_string()
+}
+
+/// Summary view as pretty-printed JSON: an array with one
+/// `{"title": ..., "url": ...}` object per collected link. Falls back to
+/// the literal `[]` if serialization fails.
+pub fn to_json_summary(result: &ExtractionResult) -> String {
+    let mut items: Vec<serde_json::Value> = Vec::new();
+    for (title, url) in collect_summary_links(result) {
+        items.push(serde_json::json!({
+            "title": title,
+            "url": url,
+        }));
+    }
+    serde_json::to_string_pretty(&items).unwrap_or_else(|_| "[]".to_string())
+}
+
+/// Gather the `(label, href)` pairs shown in summary mode.
+///
+/// The page markdown is run through `body::process_body` and its `links`
+/// are reused, so the summary stays consistent with the main LLM output.
+/// Each label is passed through `links::clean_link_label`; links whose
+/// cleaned label is empty are dropped.
+fn collect_summary_links(result: &ExtractionResult) -> Vec<(String, String)> {
+    body::process_body(&result.content.markdown)
+        .links
+        .into_iter()
+        .filter_map(|(text, href)| {
+            let label = links::clean_link_label(&text);
+            (!label.is_empty()).then_some((label, href))
+        })
+        .collect()
+}
+
+// ---------------------------------------------------------------------------
+// TOC mode — H1/H2 outline + first paragraph after each H2
+// ---------------------------------------------------------------------------
+
+/// One outline entry produced by `collect_toc_entries`.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub(crate) struct TocEntry {
+    /// Heading depth: 1 for `# ` lines, 2 for `## ` lines.
+    pub level: u8,
+    /// Heading text with the `#` marker and surrounding whitespace removed.
+    pub heading: String,
+    /// First paragraph collected for an H2; empty for H1 entries and for
+    /// H2 entries with no following body text.
+    pub intro: String,
+}
+
+/// Render the table-of-contents view as markdown.
+///
+/// Output is the metadata header followed by every entry from
+/// `collect_toc_entries`: a `#`/`##` heading line, a blank line, and — when
+/// the entry has one — its intro paragraph followed by another blank line.
+/// Trailing whitespace is trimmed from the result.
+pub fn to_llm_toc(result: &ExtractionResult, url: Option<&str>) -> String {
+    let outline = collect_toc_entries(result);
+
+    let mut doc = String::new();
+    build_metadata_header(&mut doc, result, url);
+    if !doc.is_empty() {
+        doc.push('\n');
+    }
+
+    for TocEntry {
+        level,
+        heading,
+        intro,
+    } in &outline
+    {
+        // Heading line plus the blank line that always follows it.
+        doc.push_str(&"#".repeat(usize::from(*level)));
+        doc.push(' ');
+        doc.push_str(heading);
+        doc.push_str("\n\n");
+        if !intro.is_empty() {
+            doc.push_str(intro);
+            doc.push_str("\n\n");
+        }
+    }
+
+    doc.trim_end().to_string()
+}
+
+/// Table-of-contents view as pretty-printed JSON: an array with one
+/// `{"level": N, "heading": ..., "intro": ...}` object per outline entry.
+/// Falls back to the literal `[]` if serialization fails.
+pub fn to_json_toc(result: &ExtractionResult) -> String {
+    let mut items: Vec<serde_json::Value> = Vec::new();
+    for TocEntry {
+        level,
+        heading,
+        intro,
+    } in collect_toc_entries(result)
+    {
+        items.push(serde_json::json!({
+            "level": level,
+            "heading": heading,
+            "intro": intro,
+        }));
+    }
+    serde_json::to_string_pretty(&items).unwrap_or_else(|_| "[]".to_string())
+}
+
+/// Walk the processed body text, pulling out H1/H2 headings and the first
+/// paragraph that follows each H2.
+///
+/// The markdown is first run through `body::process_body`; its `text` is
+/// then scanned line by line (leading whitespace ignored):
+/// - `# ` lines become level-1 entries and stop intro collection,
+/// - `## ` lines become level-2 entries and start intro collection,
+/// - any other line starting with `#`, and any blank line, ends the
+///   paragraph currently being collected,
+/// - remaining lines are joined with single spaces into the intro of the
+///   most recent H2 that does not have one yet.
+///
+/// NOTE(review): an H3+ line does not clear the pending H2, so when an H2 is
+/// followed directly by an H3 the first paragraph under that H3 becomes the
+/// H2's intro — confirm this is intended.
+/// NOTE(review): every line beginning with `#` is treated as heading-like,
+/// which would include `# comment` lines inside fenced code blocks if
+/// `process_body` keeps them in `text` — confirm.
+pub(crate) fn collect_toc_entries(result: &ExtractionResult) -> Vec<TocEntry> {
+    let processed = body::process_body(&result.content.markdown);
+    let text = &processed.text;
+
+    let mut entries: Vec<TocEntry> = Vec::new();
+    // Index into `entries` of the H2 still waiting for its intro, if any.
+    let mut current_h2_idx: Option<usize> = None;
+    // Accumulates the lines of the paragraph currently being read.
+    let mut paragraph: String = String::new();
+    let mut in_paragraph = false;
+
+    // Finish the paragraph in progress: store it as the pending H2's intro
+    // (first paragraph only), then reset the accumulator. State is passed in
+    // explicitly so the closure does not hold borrows across the loop body.
+    let flush_paragraph =
+        |paragraph: &mut String, in_paragraph: &mut bool, current_h2_idx: &mut Option<usize>, entries: &mut Vec<TocEntry>| {
+            if *in_paragraph {
+                let trimmed = paragraph.trim().to_string();
+                if !trimmed.is_empty()
+                    && let Some(idx) = *current_h2_idx
+                    && entries[idx].intro.is_empty()
+                {
+                    entries[idx].intro = trimmed;
+                    // Intro captured; later paragraphs under this H2 are ignored.
+                    *current_h2_idx = None;
+                }
+                paragraph.clear();
+                *in_paragraph = false;
+            }
+        };
+
+    for line in text.lines() {
+        let trimmed = line.trim_start();
+        if let Some(rest) = trimmed.strip_prefix("# ") {
+            flush_paragraph(&mut paragraph, &mut in_paragraph, &mut current_h2_idx, &mut entries);
+            entries.push(TocEntry {
+                level: 1,
+                heading: rest.trim().to_string(),
+                intro: String::new(),
+            });
+            // Text directly under an H1 is never collected as an intro.
+            current_h2_idx = None;
+        } else if let Some(rest) = trimmed.strip_prefix("## ") {
+            flush_paragraph(&mut paragraph, &mut in_paragraph, &mut current_h2_idx, &mut entries);
+            entries.push(TocEntry {
+                level: 2,
+                heading: rest.trim().to_string(),
+                intro: String::new(),
+            });
+            current_h2_idx = Some(entries.len() - 1);
+        } else if trimmed.starts_with("#") {
+            // H3+ — ignore for the outline, but ends any in-progress intro paragraph.
+            flush_paragraph(&mut paragraph, &mut in_paragraph, &mut current_h2_idx, &mut entries);
+        } else if trimmed.is_empty() {
+            // Blank line: paragraph boundary.
+            flush_paragraph(&mut paragraph, &mut in_paragraph, &mut current_h2_idx, &mut entries);
+        } else {
+            // Body text. Only collect intros for the most-recent H2 with no intro yet.
+            if let Some(idx) = current_h2_idx
+                && entries[idx].intro.is_empty()
+            {
+                if in_paragraph {
+                    // Soft-wrapped lines of one paragraph are joined with a space.
+                    paragraph.push(' ');
+                }
+                paragraph.push_str(trimmed);
+                in_paragraph = true;
+            }
+        }
+    }
+    // End-of-text flush
+    flush_paragraph(&mut paragraph, &mut in_paragraph, &mut current_h2_idx, &mut entries);
+
+    entries
+}
+
+// ---------------------------------------------------------------------------
+// Byte-cap truncation
+// ---------------------------------------------------------------------------
+
+/// Truncate `s` so the returned string is at most `cap` bytes long,
+/// honoring UTF-8 codepoint boundaries and appending a footer that names
+/// how many bytes were dropped.
+///
+/// - `cap == 0` is treated as "no cap" — returns `s` unchanged.
+/// - If `s.len() <= cap`, `s` is returned unchanged with no footer.
+/// - When truncation happens, the footer is included inside the cap:
+///   kept body + separator newline + footer never exceed `cap`. This is
+///   best-effort — if `cap` is smaller than the footer itself, the body is
+///   empty and the footer alone is returned, over the cap (only for very
+///   small caps such as `--max-output-bytes 50`).
+pub fn truncate_with_footer(s: &str, cap: usize) -> String {
+    let original_bytes = s.len();
+    if cap == 0 || original_bytes <= cap {
+        return s.to_string();
+    }
+
+    // The footer embeds the dropped-byte count, which is not known until
+    // the cut point is chosen. Size it with the worst case (everything
+    // dropped): the real footer can only be the same length or shorter.
+    let footer_max_len = build_footer(original_bytes, original_bytes, original_bytes).len();
+    // Reserve the footer plus one byte for the separator newline inserted
+    // when the kept body does not already end with '\n'.
+    let body_budget = cap.saturating_sub(footer_max_len).saturating_sub(1);
+
+    // Step back to the nearest codepoint boundary at or below the budget.
+    // `body_budget < cap < s.len()`, offset 0 is always a boundary, and a
+    // UTF-8 sequence is at most 4 bytes, so this takes at most 3 steps.
+    let mut kept_bytes = body_budget;
+    while !s.is_char_boundary(kept_bytes) {
+        kept_bytes -= 1;
+    }
+
+    let dropped_bytes = original_bytes - kept_bytes;
+    let footer = build_footer(original_bytes, dropped_bytes, kept_bytes);
+
+    let mut out = String::with_capacity(kept_bytes + footer.len() + 1);
+    out.push_str(&s[..kept_bytes]);
+    // Make sure the footer starts on its own line if the body didn't end with one.
+    if !out.is_empty() && !out.ends_with('\n') {
+        out.push('\n');
+    }
+    out.push_str(&footer);
+    out
+}
+
+/// Build the newline-terminated truncation footer line, naming how many
+/// bytes were dropped and how large the original output was.
+/// `_kept_bytes` is accepted but not used in the message.
+fn build_footer(original_bytes: usize, dropped_bytes: usize, _kept_bytes: usize) -> String {
+    format!(
+        "[truncated: {dropped_bytes} more bytes — original output was {original_bytes} bytes; pass --max-output-bytes 0 to disable, or increase the cap]\n"
+    )
+}
+
+/// JSON-aware truncation: when a JSON document is too large, we don't
+/// cut the JSON itself (that would produce invalid syntax). Instead we
+/// emit a compact wrapper object that reports the truncation and carries a
+/// best-effort prefix of the original JSON text in its `data` string.
+///
+/// This is what `--max-output-bytes N -f json` returns when the rendered
+/// JSON would exceed N bytes.
+///
+/// - `cap == 0` or `s.len() <= cap`: returns `s` unchanged.
+/// - Otherwise returns
+///   `{"_truncated": true, "_original_bytes", "_truncated_bytes", "_note", "data"}`.
+///   `data` decodes to the exact prefix of `s` that was kept (cut at a
+///   UTF-8 codepoint boundary).
+/// - Best-effort: if `cap` is smaller than the wrapper's own overhead the
+///   result exceeds `cap`.
+pub fn truncate_json_with_wrapper(s: &str, cap: usize) -> String {
+    let original_bytes = s.len();
+    if cap == 0 || original_bytes <= cap {
+        return s.to_string();
+    }
+
+    // `json!` escapes `data` when the value is serialized, so the raw
+    // prefix is passed in as-is. Escaping it beforehand would double-escape
+    // it and consumers would read back literal backslash sequences.
+    let wrapper = |data: &str| -> String {
+        serde_json::json!({
+            "_truncated": true,
+            "_original_bytes": original_bytes,
+            "_truncated_bytes": original_bytes - data.len(),
+            "_note": "pass --max-output-bytes 0 to disable, or increase the cap",
+            "data": data,
+        })
+        .to_string()
+    };
+
+    // Wrapper size with an empty payload.
+    let overhead = wrapper("").len();
+    // Escaping can grow the payload (quotes and newlines double, control
+    // characters become \uXXXX). Start from a 2x growth assumption and
+    // shrink below if the serialized candidate still overshoots.
+    let mut body_budget = (cap.saturating_sub(overhead).saturating_sub(8) / 2).max(1);
+
+    loop {
+        // Largest codepoint boundary at or below the budget.
+        let mut kept_bytes = body_budget.min(original_bytes);
+        while !s.is_char_boundary(kept_bytes) {
+            kept_bytes -= 1;
+        }
+
+        let candidate = wrapper(&s[..kept_bytes]);
+        if candidate.len() <= cap || body_budget <= 1 {
+            return candidate;
+        }
+        // Overshoot — shrink by at least 64 bytes so the loop terminates
+        // quickly, then retry.
+        let shrink = (candidate.len() - cap).max(64);
+        body_budget = if body_budget <= shrink {
+            1
+        } else {
+            body_budget - shrink
+        };
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{Content, ExtractionResult, Metadata};
+
+    fn make_result(markdown: &str) -> ExtractionResult {
+        ExtractionResult {
+            metadata: Metadata {
+                title: Some("Test Page".to_string()),
+                description: None,
+                author: None,
+                published_date: None,
+                language: None,
+                url: Some("https://example.com/".to_string()),
+                site_name: None,
+                image: None,
+                favicon: None,
+                word_count: 0,
+            },
+            content: Content {
+                markdown: markdown.to_string(),
+                plain_text: String::new(),
+                links: Vec::new(),
+                images: Vec::new(),
+                code_blocks: Vec::new(),
+                raw_html: None,
+            },
+            domain_data: None,
+            structured_data: Vec::new(),
+        }
+    }
+
+    // -- truncation tests --
+
+    #[test]
+    fn test_max_output_bytes_truncates_correctly() {
+        // Build a ~100KB ASCII input.
+        let input = "a".repeat(100_000);
+        let out = truncate_with_footer(&input, 4096);
+        assert!(out.len() <= 4096, "got {} bytes, cap 4096", out.len());
+        assert!(out.contains("[truncated:"), "footer missing: {out}");
+        assert!(out.contains("100000 bytes"), "original byte count missing: {out}");
+        // The dropped-byte count in the footer must equal original - kept.
+        // Body kept = out.len() - footer_len. Footer ends with \n.
+        let footer_start = out.find("[truncated:").expect("footer present");
+        let body_kept = footer_start.saturating_sub(1); // minus the newline before the footer
+        let dropped = 100_000usize.saturating_sub(body_kept);
+        let needle = format!("[truncated: {dropped} more bytes");
+        assert!(
+            out.contains(&needle),
+            "expected dropped={dropped} in footer; got: {}",
+            &out[footer_start..]
+        );
+    }
+
+    #[test]
+    fn test_max_output_bytes_zero_means_unlimited() {
+        let input = "a".repeat(100_000);
+        let out = truncate_with_footer(&input, 0);
+        assert_eq!(out, input);
+        assert!(!out.contains("[truncated:"));
+    }
+
+    #[test]
+    fn test_max_output_bytes_utf8_boundary() {
+        // Every char is the 2-byte 'é', so every odd byte offset falls in
+        // the middle of a codepoint. Two consecutive caps give two body
+        // budgets of different parity, so at least one of them lands
+        // mid-codepoint and forces the boundary search to step back
+        // (a naive byte slice would panic there).
+        let s = "é".repeat(5_000);
+        for cap in [4096usize, 4097] {
+            let out = truncate_with_footer(&s, cap);
+            assert!(out.len() <= cap, "cap {cap}: got {} bytes", out.len());
+            // It must contain the footer (we truncated).
+            let footer_start = out
+                .find("\n[truncated:")
+                .expect("footer present on its own line");
+            let body = &out[..footer_start];
+            assert!(!body.is_empty(), "cap {cap}: body unexpectedly empty");
+            // Only whole 'é' codepoints were kept, and the body is an
+            // unmodified prefix of the input.
+            assert_eq!(body.len() % 2, 0, "cap {cap}: body split a codepoint");
+            assert!(s.starts_with(body), "cap {cap}: body is not a prefix of the input");
+        }
+    }
+
+    // -- mode tests --
+
+    #[test]
+    fn test_mode_summary_returns_links_only() {
+        let md = r"# Some Headline
+
+This is body text that summary mode should NOT include.
+
+Read more articles:
+
+- [Story One](https://example.com/story1)
+- [Story Two](https://example.com/story2)
+- [Story Three](https://example.com/story3)
+- [Story Four](https://example.com/story4)
+- [Story Five](https://example.com/story5)
+";
+        let r = make_result(md);
+        let out = to_llm_summary(&r, Some("https://example.com/"));
+        // Should contain all 5 links.
+        for n in ["Story One", "Story Two", "Story Three", "Story Four", "Story Five"] {
+            assert!(out.contains(n), "summary missing link {n}: {out}");
+        }
+        // Should NOT contain the body sentence.
+        assert!(
+            !out.contains("This is body text"),
+            "summary leaked body text: {out}"
+        );
+        // Should have a Links section header.
+        assert!(out.contains("## Links"), "missing Links header: {out}");
+    }
+
+    #[test]
+    fn test_mode_toc_returns_outline() {
+        let md = r"# Top Level Title
+
+Intro paragraph that should not be associated with H1.
+
+## Section A
+
+First paragraph of section A goes here.
+
+More body text we don't want as intro.
+
+## Section B
+
+First paragraph of section B.
+
+## Section C
+
+First paragraph of section C.
+";
+        let r = make_result(md);
+        let out = to_llm_toc(&r, Some("https://example.com/"));
+        // Should have one H1 and three H2s.
+        assert!(out.contains("# Top Level Title"), "missing H1: {out}");
+        assert!(out.contains("## Section A"), "missing H2-A: {out}");
+        assert!(out.contains("## Section B"), "missing H2-B: {out}");
+        assert!(out.contains("## Section C"), "missing H2-C: {out}");
+        // Should have the first paragraph for each H2.
+        assert!(
+            out.contains("First paragraph of section A"),
+            "missing intro A: {out}"
+        );
+        assert!(
+            out.contains("First paragraph of section B"),
+            "missing intro B: {out}"
+        );
+        assert!(
+            out.contains("First paragraph of section C"),
+            "missing intro C: {out}"
+        );
+        // Should NOT contain the second-paragraph-after-A body line.
+        assert!(
+            !out.contains("More body text"),
+            "toc leaked second paragraph: {out}"
+        );
+
+        // Structured entries: 1 H1 + 3 H2s.
+        let entries = collect_toc_entries(&r);
+        assert_eq!(entries.len(), 4, "expected 4 entries, got {entries:?}");
+        assert_eq!(entries[0].level, 1);
+        assert_eq!(entries[1].level, 2);
+    }
+
+    #[test]
+    fn test_mode_summary_with_byte_cap() {
+        // Generate a summary that's bigger than the cap, then verify cap applies.
+        let mut md = String::from("# Lots of links\n\n");
+        for i in 0..200 {
+            // Single format call; the URL needs no separate `format!`.
+            md.push_str(&format!(
+                "- [Story number {i} with a fairly long title](https://example.com/story-{i})\n"
+            ));
+        }
+        let r = make_result(&md);
+        let summary = to_llm_summary(&r, Some("https://example.com/"));
+        assert!(summary.len() > 4096, "expected summary > cap; got {}", summary.len());
+        let capped = truncate_with_footer(&summary, 4096);
+        assert!(capped.len() <= 4096, "got {} bytes", capped.len());
+        assert!(capped.contains("[truncated:"));
+    }
+
+    #[test]
+    fn test_json_summary_shape() {
+        let md = "# T\n\n- [A](https://example.com/a)\n- [B](https://example.com/b)\n";
+        let r = make_result(md);
+        let s = to_json_summary(&r);
+        let v: serde_json::Value = serde_json::from_str(&s).expect("valid JSON");
+        let arr = v.as_array().expect("array");
+        assert_eq!(arr.len(), 2);
+        assert_eq!(arr[0]["title"].as_str().unwrap(), "A");
+        assert_eq!(arr[0]["url"].as_str().unwrap(), "https://example.com/a");
+    }
+
+    #[test]
+    fn test_json_toc_shape() {
+        let md = "# H1\n\n## A\n\nIntro A.\n\n## B\n\nIntro B.\n";
+        let r = make_result(md);
+        let s = to_json_toc(&r);
+        let v: serde_json::Value = serde_json::from_str(&s).expect("valid JSON");
+        let arr = v.as_array().expect("array");
+        assert_eq!(arr.len(), 3);
+        assert_eq!(arr[0]["level"].as_u64().unwrap(), 1);
+        assert_eq!(arr[0]["heading"].as_str().unwrap(), "H1");
+        assert_eq!(arr[1]["level"].as_u64().unwrap(), 2);
+        assert_eq!(arr[1]["intro"].as_str().unwrap(), "Intro A.");
+    }
+
+    #[test]
+    fn test_json_truncation_remains_valid_json() {
+        // Build a big serialized JSON.
+        let huge = serde_json::json!({
+            "data": "x".repeat(100_000),
+        });
+        let s = serde_json::to_string_pretty(&huge).unwrap();
+        let out = truncate_json_with_wrapper(&s, 4096);
+        // Resulting string must parse as JSON.
+        let parsed: serde_json::Value =
+            serde_json::from_str(&out).expect("truncated JSON should still parse");
+        assert_eq!(parsed["_truncated"].as_bool(), Some(true));
+        assert!(parsed["_original_bytes"].as_u64().is_some());
+        assert!(out.len() <= 4096);
+    }
+}