feat(cli): add --max-output-bytes and --mode summary,toc for output-size control

Two additive CLI flags (--max-output-bytes, plus --mode with two new values)
addressing the 50KB persisted-output cap that trips Claude Code's
per-tool-result harness on aggregator front pages
(apnews.com, cnbc.com/markets/, b92.net are all >50KB by default):

--max-output-bytes N: truncates final output at N bytes with a clear
'[truncated: M more bytes ...]' footer. N=0 means unlimited (default).
UTF-8 codepoint-boundary safe; also wraps JSON output so truncated
output stays parseable.

--mode summary: returns only the extracted link list (titles + URLs),
no body text. For aggregator front pages where the LLM is going to
drill the individual articles next anyway.

--mode toc: returns H1/H2 outline + first paragraph after each H2.
For long single-article pages.

New flags are orthogonal to -f (json/llm/text). 9 new unit tests in
webclaw-core, total goes 308 -> 317 passing. Smoke-tested on
apnews.com (51713 -> 27404 summary -> 6269 toc -> 8193 capped),
pitchfork.com (42049 -> 379 summary), cnbc.com (56682 -> 16385 capped).
This commit is contained in:
devnen 2026-05-23 18:17:42 +02:00
parent 562c6a15f0
commit 339f41bb7c
4 changed files with 756 additions and 54 deletions

View file

@ -170,6 +170,16 @@ struct Cli {
#[arg(short, long, default_value = "markdown")]
format: OutputFormat,
/// Output mode: full (default), summary (link list), or toc (H1/H2 outline + first paragraph).
/// Orthogonal to --format; e.g. `-f json --mode summary` returns a JSON link array.
#[arg(long, default_value = "full")]
mode: OutputMode,
/// Cap the final output at N bytes; on overflow truncate at a UTF-8 boundary
/// and append a [truncated: M more bytes ...] footer (JSON output is wrapped in
/// a _truncated object instead, so it stays parseable). 0 = no cap (default).
#[arg(long, default_value = "0")]
max_output_bytes: u64,
/// Browser to impersonate
#[arg(short, long, default_value = "chrome")]
browser: Browser,
@ -413,6 +423,17 @@ enum OutputFormat {
Html,
}
/// How much of the extracted page is emitted, independently of `--format`.
///
/// The CLI defaults to `full` (see the `--mode` argument), which keeps the
/// output identical to what was produced before this option existed.
#[derive(PartialEq, Eq, Clone, ValueEnum)]
enum OutputMode {
    // Everything: the historical, unabridged rendering.
    Full,
    // Only the page's link list (titles + URLs), no body text.
    Summary,
    // H1/H2 outline plus the first paragraph following each H2.
    Toc,
}
#[derive(Clone, ValueEnum)]
enum Browser {
Chrome,
@ -719,26 +740,80 @@ fn raw_html_or_markdown(result: &ExtractionResult) -> &str {
/// Render an `ExtractionResult` as a string in the requested output format.
///
/// Thin convenience wrapper over `format_output_with_mode` that always asks
/// for the full page (`OutputMode::Full`) with no byte cap.
fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
    // 0 is the "no cap" sentinel understood by `format_output_with_mode`.
    const NO_CAP: u64 = 0;
    let full_page = OutputMode::Full;
    format_output_with_mode(result, format, show_metadata, &full_page, NO_CAP)
}
/// Render an `ExtractionResult` for a given format and mode, then enforce the
/// byte cap. The returned string is what gets printed or written to disk.
///
/// Modes:
/// - `Full`: the legacy, unabridged rendering for `format`.
/// - `Summary`: the link list only (a JSON array of `{title, url}` for the
///   JSON format, a text rendering otherwise).
/// - `Toc`: H1/H2 outline plus the first paragraph after each H2.
///
/// Cap: `max_output_bytes == 0` means "unlimited". Any other value is handed
/// to `apply_byte_cap`, which truncates at a UTF-8 boundary and adds a
/// `[truncated: ...]` footer (or a `_truncated` JSON wrapper for the JSON
/// format so the document stays parseable).
fn format_output_with_mode(
    result: &ExtractionResult,
    format: &OutputFormat,
    show_metadata: bool,
    mode: &OutputMode,
    max_output_bytes: u64,
) -> String {
    apply_byte_cap(
        &render_body(result, format, show_metadata, mode),
        format,
        max_output_bytes,
    )
}
/// Produce the uncapped output for `result` under the given `mode`/`format`.
///
/// `show_metadata` only affects the full Markdown rendering, where it
/// prepends the frontmatter block. In `Summary` and `Toc` modes every
/// non-JSON format shares the same text rendering.
fn render_body(
    result: &ExtractionResult,
    format: &OutputFormat,
    show_metadata: bool,
    mode: &OutputMode,
) -> String {
    let page_url = result.metadata.url.as_deref();

    match (mode, format) {
        // Abridged modes: JSON gets a structured array, everything else the text form.
        (OutputMode::Summary, OutputFormat::Json) => webclaw_core::to_json_summary(result),
        (OutputMode::Summary, _) => webclaw_core::to_llm_summary(result, page_url),
        (OutputMode::Toc, OutputFormat::Json) => webclaw_core::to_json_toc(result),
        (OutputMode::Toc, _) => webclaw_core::to_llm_toc(result, page_url),

        // Full mode: one rendering per output format.
        (OutputMode::Full, OutputFormat::Markdown) => {
            let mut rendered = if show_metadata {
                format_frontmatter(&result.metadata)
            } else {
                String::new()
            };
            rendered.push_str(&result.content.markdown);

            if !result.structured_data.is_empty() {
                let structured =
                    serde_json::to_string_pretty(&result.structured_data).unwrap_or_default();
                rendered.push_str("\n\n## Structured Data\n\n```json\n");
                rendered.push_str(&structured);
                rendered.push_str("\n```");
            }
            rendered
        }
        (OutputMode::Full, OutputFormat::Json) => {
            serde_json::to_string_pretty(result).expect("serialization failed")
        }
        (OutputMode::Full, OutputFormat::Text) => result.content.plain_text.clone(),
        (OutputMode::Full, OutputFormat::Llm) => to_llm_text(result, page_url),
        (OutputMode::Full, OutputFormat::Html) => raw_html_or_markdown(result).to_string(),
    }
}
/// Enforce `--max-output-bytes` on an already-rendered `body`.
///
/// - `cap == 0` disables the cap; `body` is returned unchanged.
/// - JSON output is routed through `truncate_json_with_wrapper` so the result
///   remains a parseable document.
/// - Every other format is routed through `truncate_with_footer`, which cuts
///   at a UTF-8 boundary and appends a `[truncated: ...]` footer.
fn apply_byte_cap(body: &str, format: &OutputFormat, cap: u64) -> String {
    if cap == 0 {
        return body.to_string();
    }

    // Saturate instead of using `as`: on targets where `usize` is narrower
    // than 64 bits a plain cast would wrap, turning a huge cap into a tiny
    // one (or into 0, which the truncation helpers treat as "unlimited").
    let cap = usize::try_from(cap).unwrap_or(usize::MAX);

    match format {
        OutputFormat::Json => webclaw_core::truncate_json_with_wrapper(body, cap),
        OutputFormat::Markdown
        | OutputFormat::Text
        | OutputFormat::Llm
        | OutputFormat::Html => webclaw_core::truncate_with_footer(body, cap),
    }
}
@ -1036,37 +1111,15 @@ fn format_frontmatter(meta: &Metadata) -> String {
lines.join("\n")
}
fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) {
match format {
OutputFormat::Markdown => {
if show_metadata {
print!("{}", format_frontmatter(&result.metadata));
}
println!("{}", result.content.markdown);
if !result.structured_data.is_empty() {
println!(
"\n## Structured Data\n\n```json\n{}\n```",
serde_json::to_string_pretty(&result.structured_data).unwrap_or_default()
);
}
}
OutputFormat::Json => {
// serde_json::to_string_pretty won't fail on our types
println!(
"{}",
serde_json::to_string_pretty(result).expect("serialization failed")
);
}
OutputFormat::Text => {
println!("{}", result.content.plain_text);
}
OutputFormat::Llm => {
println!("{}", to_llm_text(result, result.metadata.url.as_deref()));
}
OutputFormat::Html => {
println!("{}", raw_html_or_markdown(result));
}
}
/// Render `result` via `format_output_with_mode` and write it to stdout.
///
/// With no cap (`max_output_bytes == 0`) the output is printed with a
/// trailing newline, exactly like the legacy printing path.
///
/// With a cap, the rendered string has already been limited to
/// `max_output_bytes`, so the trailing newline is only added when it still
/// fits under the cap and the text does not already end with one. Otherwise
/// stdout would carry one byte more than the requested cap.
fn print_output_with_mode(
    result: &ExtractionResult,
    format: &OutputFormat,
    show_metadata: bool,
    mode: &OutputMode,
    max_output_bytes: u64,
) {
    let out = format_output_with_mode(result, format, show_metadata, mode, max_output_bytes);

    if max_output_bytes == 0 {
        println!("{out}");
        return;
    }

    let rendered_len = u64::try_from(out.len()).unwrap_or(u64::MAX);
    let newline_fits = rendered_len < max_output_bytes;
    if newline_fits && !out.ends_with('\n') {
        println!("{out}");
    } else {
        print!("{out}");
    }
}
/// Print cloud API response in the requested format.
@ -1132,6 +1185,53 @@ fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
}
}
/// Print a cloud API response in `format`, honouring `--max-output-bytes`.
///
/// `cap == 0` delegates straight to `print_cloud_output`, so the uncapped
/// behaviour is untouched. Otherwise the response is rendered to a string,
/// capped with `apply_byte_cap`, and written to stdout. The trailing newline
/// is only added when it still fits under the cap and the text does not
/// already end with one, so stdout never carries more than `cap` bytes
/// because of the newline.
fn print_cloud_output_capped(resp: &serde_json::Value, format: &OutputFormat, cap: u64) {
    if cap == 0 {
        print_cloud_output(resp, format);
        return;
    }

    let body = render_cloud_body(resp, format);
    let capped = apply_byte_cap(&body, format, cap);

    let rendered_len = u64::try_from(capped.len()).unwrap_or(u64::MAX);
    if rendered_len < cap && !capped.ends_with('\n') {
        println!("{capped}");
    } else {
        print!("{capped}");
    }
}
/// Turn a cloud API response into the string that would be printed for
/// `format`, without printing it.
///
/// Lookup rules:
/// - `Json`: the whole response, pretty-printed.
/// - `Markdown`: `content.markdown`, else top-level `markdown`, else the
///   pretty-printed response.
/// - `Text` / `Llm` / `Html`: `content.plain_text` / `content.llm_text` /
///   `content.raw_html` respectively, falling back to the Markdown rule when
///   the field is missing or not a string.
fn render_cloud_body(resp: &serde_json::Value, format: &OutputFormat) -> String {
    let whole_response = || serde_json::to_string_pretty(resp).expect("serialization failed");

    // String value of `content.<field>`, if present.
    let content_field = |field: &str| -> Option<String> {
        let text = resp.get("content")?.get(field)?.as_str()?;
        Some(text.to_string())
    };

    let markdown = || {
        content_field("markdown")
            .or_else(|| {
                let top_level = resp.get("markdown")?.as_str()?;
                Some(top_level.to_string())
            })
            .unwrap_or_else(whole_response)
    };

    match format {
        OutputFormat::Json => whole_response(),
        OutputFormat::Markdown => markdown(),
        OutputFormat::Text => content_field("plain_text").unwrap_or_else(markdown),
        OutputFormat::Llm => content_field("llm_text").unwrap_or_else(markdown),
        OutputFormat::Html => content_field("raw_html").unwrap_or_else(markdown),
    }
}
fn print_diff_output(diff: &ContentDiff, format: &OutputFormat) {
match format {
OutputFormat::Json => {
@ -2662,17 +2762,33 @@ async fn main() {
.unwrap_or_default();
let custom_name = entries.first().and_then(|(_, name)| name.clone());
let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format));
let content = format_output(&result, &cli.format, cli.metadata);
let content = format_output_with_mode(
&result,
&cli.format,
cli.metadata,
&cli.mode,
cli.max_output_bytes,
);
if let Err(e) = write_to_file(dir, &filename, &content) {
eprintln!("error: {e}");
process::exit(1);
}
} else {
print_output(&result, &cli.format, cli.metadata);
print_output_with_mode(
&result,
&cli.format,
cli.metadata,
&cli.mode,
cli.max_output_bytes,
);
}
}
Ok(FetchOutput::Cloud(resp)) => {
print_cloud_output(&resp, &cli.format);
// Cloud path does not yet have a structured ExtractionResult,
// so --mode summary/toc can't be applied here. We still apply
// the byte cap to the rendered cloud output by routing through
// a helper that prints to a buffer first.
print_cloud_output_capped(&resp, &cli.format, cli.max_output_bytes);
}
Err(e) => {
eprintln!("{e}");

View file

@ -25,7 +25,10 @@ pub use brand::BrandIdentity;
pub use diff::{ChangeStatus, ContentDiff, MetadataChange};
pub use domain::DomainType;
pub use error::ExtractError;
pub use llm::to_llm_text;
pub use llm::{
to_json_summary, to_json_toc, to_llm_summary, to_llm_text, to_llm_toc,
truncate_json_with_wrapper, truncate_with_footer,
};
pub use types::{
CodeBlock, Content, DomainData, ExtractionOptions, ExtractionResult, Image, Link, Metadata,
};

View file

@ -9,6 +9,12 @@ mod cleanup;
mod images;
mod links;
mod metadata;
mod output_size;
pub use output_size::{
to_json_summary, to_json_toc, to_llm_summary, to_llm_toc, truncate_json_with_wrapper,
truncate_with_footer,
};
use crate::types::ExtractionResult;

View file

@ -0,0 +1,577 @@
//! Output-size control: alternate output modes (summary, toc) plus
//! post-format byte-cap truncation with a clear footer.
//!
//! Three orthogonal axes:
//! - `OutputMode` (full | summary | toc) selects what to emit
//! - `OutputFormat` (text/markdown vs json) is owned by the caller
//! - `max_output_bytes` caps the FINAL byte count after format emission
//!
//! `summary` returns a navigation/link list extracted from the page.
//! `toc` returns the H1/H2 outline plus the first paragraph after each H2.
//! `truncate_with_footer` only cuts on UTF-8 codepoint boundaries, so it
//! never splits a multi-byte character.
use crate::types::ExtractionResult;
use super::body;
use super::links;
use super::metadata::build_metadata_header;
// ---------------------------------------------------------------------------
// Summary mode — link/title list, no body
// ---------------------------------------------------------------------------
/// Render the page as a Markdown link list: one `- [Title](URL)` bullet per
/// link returned by `collect_summary_links`, under a `## Links` heading.
///
/// Whatever `build_metadata_header` writes for this page is emitted first
/// (followed by a blank separator line) so readers can still tell which page
/// the summary belongs to. Trailing whitespace is trimmed from the result.
pub fn to_llm_summary(result: &ExtractionResult, url: Option<&str>) -> String {
    let entries = collect_summary_links(result);

    let mut rendered = String::new();
    build_metadata_header(&mut rendered, result, url);
    if !rendered.is_empty() {
        rendered.push('\n');
    }

    rendered.push_str("## Links\n");
    for (label, href) in entries.iter() {
        rendered.push_str("- [");
        rendered.push_str(label);
        rendered.push_str("](");
        rendered.push_str(href);
        rendered.push_str(")\n");
    }

    rendered.trim_end().to_string()
}
/// Summary as pretty-printed JSON: an array of `{"title": ..., "url": ...}`
/// objects, one per link returned by `collect_summary_links`.
///
/// Falls back to the literal `[]` if serialization fails.
pub fn to_json_summary(result: &ExtractionResult) -> String {
    let mut items: Vec<serde_json::Value> = Vec::new();
    for (title, url) in collect_summary_links(result) {
        items.push(serde_json::json!({
            "title": title,
            "url": url,
        }));
    }

    match serde_json::to_string_pretty(&items) {
        Ok(json) => json,
        Err(_) => "[]".to_string(),
    }
}
/// Gather the `(label, href)` pairs that make up the summary.
///
/// The links come from `body::process_body`, i.e. the same pipeline the main
/// LLM output uses, so summary mode stays consistent with it. Deduplication
/// and noise filtering are presumably done there (not visible from this
/// file); this function only cleans each label with
/// `links::clean_link_label` and drops links whose cleaned label is empty.
fn collect_summary_links(result: &ExtractionResult) -> Vec<(String, String)> {
    let processed = body::process_body(&result.content.markdown);

    processed
        .links
        .into_iter()
        .filter_map(|(text, href)| {
            let label = links::clean_link_label(&text);
            if label.is_empty() {
                None
            } else {
                Some((label, href))
            }
        })
        .collect()
}
// ---------------------------------------------------------------------------
// TOC mode — H1/H2 outline + first paragraph after each H2
// ---------------------------------------------------------------------------
/// One line of the table of contents produced by `collect_toc_entries`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct TocEntry {
    /// Heading depth: 1 for an H1, 2 for an H2.
    pub level: u8,
    /// Heading text with the leading `#` markers and surrounding whitespace removed.
    pub heading: String,
    /// First paragraph following an H2; empty for H1 entries and for H2s with no body text.
    pub intro: String,
}
/// Render the page outline as Markdown.
///
/// Every entry from `collect_toc_entries` becomes a heading line (`#` or
/// `##` according to its level) followed by a blank line. When the entry has
/// an intro paragraph, that paragraph and another blank line follow.
/// Whatever `build_metadata_header` writes for the page is emitted first,
/// and trailing whitespace is trimmed from the final string.
pub fn to_llm_toc(result: &ExtractionResult, url: Option<&str>) -> String {
    let outline = collect_toc_entries(result);

    let mut rendered = String::new();
    build_metadata_header(&mut rendered, result, url);
    if !rendered.is_empty() {
        rendered.push('\n');
    }

    for item in outline.iter() {
        // Heading line plus the blank line that always follows it.
        rendered.push_str(&"#".repeat(item.level as usize));
        rendered.push(' ');
        rendered.push_str(&item.heading);
        rendered.push_str("\n\n");

        if !item.intro.is_empty() {
            rendered.push_str(&item.intro);
            rendered.push_str("\n\n");
        }
    }

    rendered.trim_end().to_string()
}
/// Outline as pretty-printed JSON: an array of
/// `{"level": N, "heading": ..., "intro": ...}` objects, one per entry from
/// `collect_toc_entries`.
///
/// Falls back to the literal `[]` if serialization fails.
pub fn to_json_toc(result: &ExtractionResult) -> String {
    let mut items: Vec<serde_json::Value> = Vec::new();
    for entry in collect_toc_entries(result) {
        items.push(serde_json::json!({
            "level": entry.level,
            "heading": entry.heading,
            "intro": entry.intro,
        }));
    }

    match serde_json::to_string_pretty(&items) {
        Ok(json) => json,
        Err(_) => "[]".to_string(),
    }
}
/// Extract the H1/H2 outline from the processed body text.
///
/// The text returned by `body::process_body` is scanned line by line:
/// - `# ` lines become level-1 entries, `## ` lines become level-2 entries;
/// - any other line starting with `#` (H3 and deeper) is left out of the
///   outline but still ends the paragraph being collected;
/// - a blank line ends the paragraph being collected;
/// - other lines are body text, joined with single spaces into the intro of
///   the most recent H2 that does not have one yet.
///
/// Only the first paragraph after an H2 is kept. H1 entries never get an intro.
pub(crate) fn collect_toc_entries(result: &ExtractionResult) -> Vec<TocEntry> {
    /// Finish the paragraph accumulated in `buffer` (if any): store it as the
    /// intro of the open H2 when that H2 has none yet, then reset the buffer.
    fn close_paragraph(buffer: &mut String, open_h2: &mut Option<usize>, entries: &mut [TocEntry]) {
        if buffer.is_empty() {
            return;
        }
        let intro = buffer.trim();
        if !intro.is_empty() {
            if let Some(slot) = *open_h2 {
                if entries[slot].intro.is_empty() {
                    entries[slot].intro = intro.to_string();
                    // This H2 is done; later paragraphs must not be attached to it.
                    *open_h2 = None;
                }
            }
        }
        buffer.clear();
    }

    let processed = body::process_body(&result.content.markdown);

    let mut entries: Vec<TocEntry> = Vec::new();
    // Index into `entries` of the H2 still waiting for its intro paragraph.
    let mut open_h2: Option<usize> = None;
    // Paragraph text collected so far; non-empty exactly while a paragraph is open.
    let mut buffer = String::new();

    for raw_line in processed.text.lines() {
        let line = raw_line.trim_start();

        let heading = if let Some(title) = line.strip_prefix("# ") {
            Some((1u8, title))
        } else if let Some(title) = line.strip_prefix("## ") {
            Some((2u8, title))
        } else {
            None
        };

        if let Some((level, title)) = heading {
            close_paragraph(&mut buffer, &mut open_h2, &mut entries);
            entries.push(TocEntry {
                level,
                heading: title.trim().to_string(),
                intro: String::new(),
            });
            open_h2 = if level == 2 {
                Some(entries.len() - 1)
            } else {
                None
            };
            continue;
        }

        // Deeper headings and blank lines both terminate the current paragraph.
        if line.starts_with("#") || line.is_empty() {
            close_paragraph(&mut buffer, &mut open_h2, &mut entries);
            continue;
        }

        // Body text: only collected while an H2 is still missing its intro.
        let wants_intro = match open_h2 {
            Some(slot) => entries[slot].intro.is_empty(),
            None => false,
        };
        if wants_intro {
            if !buffer.is_empty() {
                buffer.push(' ');
            }
            buffer.push_str(line);
        }
    }

    // The text may end in the middle of a paragraph.
    close_paragraph(&mut buffer, &mut open_h2, &mut entries);
    entries
}
// ---------------------------------------------------------------------------
// Byte-cap truncation
// ---------------------------------------------------------------------------
/// Limit `s` to `cap` bytes, cutting only on UTF-8 character boundaries and
/// appending a footer that reports how many bytes were dropped.
///
/// - `cap == 0` means "no cap": `s` is returned unchanged.
/// - If `s` already fits (`s.len() <= cap`), it is returned unchanged with
///   no footer.
/// - Otherwise the footer counts towards the cap: kept body + separator
///   newline + footer stay within `cap`. The one exception is a cap smaller
///   than the footer itself, in which case the footer alone is returned and
///   the result exceeds `cap`.
pub fn truncate_with_footer(s: &str, cap: usize) -> String {
    let total = s.len();
    if cap == 0 || total <= cap {
        return s.to_string();
    }

    // Size the footer with worst-case numbers (the dropped count can never
    // have more digits than the total), and reserve one extra byte for the
    // newline that may be inserted between body and footer.
    let reserved = build_footer(total, total, total).len() + 1;
    let budget = cap.saturating_sub(reserved);

    // `budget <= cap < total`, so this is a valid index; back up to the
    // nearest character boundary at or below it (index 0 always qualifies).
    let mut cut = budget;
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }

    let footer = build_footer(total, total - cut, cut);
    let kept = &s[..cut];

    let mut out = String::with_capacity(cut + 1 + footer.len());
    out.push_str(kept);
    // Put the footer on its own line unless the body is empty or already ends one.
    if !(kept.is_empty() || kept.ends_with('\n')) {
        out.push('\n');
    }
    out.push_str(&footer);
    out
}

/// Format the truncation footer line (newline-terminated).
///
/// `_kept_bytes` is accepted but not used in the message.
fn build_footer(original_bytes: usize, dropped_bytes: usize, _kept_bytes: usize) -> String {
    format!(
        "[truncated: {dropped_bytes} more bytes — original output was {original_bytes} bytes; pass --max-output-bytes 0 to disable, or increase the cap]\n"
    )
}
/// JSON-aware truncation: when a JSON document is too large, we don't
/// truncate the JSON itself (that would produce invalid syntax). Instead
/// we emit a wrapper object that names the truncation and embeds a
/// best-effort string prefix of the original JSON.
///
/// This is what `--max-output-bytes N -f json` returns when the rendered
/// JSON would exceed N bytes.
pub fn truncate_json_with_wrapper(s: &str, cap: usize) -> String {
if cap == 0 {
return s.to_string();
}
let original_bytes = s.len();
if original_bytes <= cap {
return s.to_string();
}
// Build the wrapper skeleton first to learn its overhead, then size
// the embedded `data` slice to fit under the cap. We escape it as a
// JSON string so the document stays valid.
let wrapper = |kept_bytes: usize, data_escaped: &str| -> String {
serde_json::json!({
"_truncated": true,
"_original_bytes": original_bytes,
"_truncated_bytes": original_bytes - kept_bytes,
"_note": "pass --max-output-bytes 0 to disable, or increase the cap",
"data": data_escaped,
})
.to_string()
};
// Estimate overhead with an empty data string.
let overhead = wrapper(0, "").len();
// Each character of data may take up to 6 bytes when escaped (\uXXXX),
// but ASCII typically takes 1 — we conservatively budget for 2× growth
// and iterate down if we overshoot.
let mut body_budget = cap.saturating_sub(overhead).saturating_sub(8) / 2;
if body_budget == 0 {
body_budget = 1;
}
loop {
// Walk to the largest codepoint boundary <= body_budget.
let mut kept_bytes = 0usize;
for (i, _) in s.char_indices() {
if i > body_budget {
break;
}
kept_bytes = i;
}
if kept_bytes > original_bytes {
kept_bytes = original_bytes;
}
let escaped = serde_json::to_string(&s[..kept_bytes]).unwrap_or_else(|_| "\"\"".to_string());
// Strip outer quotes from the escaped string for embedding.
let inner = if escaped.len() >= 2 {
&escaped[1..escaped.len() - 1]
} else {
""
};
let candidate = wrapper(kept_bytes, inner);
if candidate.len() <= cap || body_budget <= 1 {
return candidate;
}
// Overshoot — shrink body_budget and retry.
let shrink = (candidate.len() - cap).max(64);
if body_budget <= shrink {
body_budget = 1;
} else {
body_budget -= shrink;
}
}
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{Content, ExtractionResult, Metadata};

    /// Build a minimal `ExtractionResult` whose only meaningful payload is
    /// `markdown`; everything else is empty apart from a fixed title and URL.
    fn make_result(markdown: &str) -> ExtractionResult {
        ExtractionResult {
            metadata: Metadata {
                title: Some("Test Page".to_string()),
                description: None,
                author: None,
                published_date: None,
                language: None,
                url: Some("https://example.com/".to_string()),
                site_name: None,
                image: None,
                favicon: None,
                word_count: 0,
            },
            content: Content {
                markdown: markdown.to_string(),
                plain_text: String::new(),
                links: Vec::new(),
                images: Vec::new(),
                code_blocks: Vec::new(),
                raw_html: None,
            },
            domain_data: None,
            structured_data: Vec::new(),
        }
    }

    // -- truncation tests --

    #[test]
    fn test_max_output_bytes_truncates_correctly() {
        // Build a ~100KB ASCII input.
        let input = "a".repeat(100_000);
        let out = truncate_with_footer(&input, 4096);
        assert!(out.len() <= 4096, "got {} bytes, cap 4096", out.len());
        assert!(out.contains("[truncated:"), "footer missing: {out}");
        assert!(out.contains("100000 bytes"), "original byte count missing: {out}");

        // The dropped-byte count in the footer must equal original - kept.
        // The body is all 'a', so a separator newline sits right before the footer.
        let footer_start = out.find("[truncated:").expect("footer present");
        let body_kept = footer_start.saturating_sub(1); // minus the newline before the footer
        let dropped = 100_000usize.saturating_sub(body_kept);
        let needle = format!("[truncated: {dropped} more bytes");
        assert!(
            out.contains(&needle),
            "expected dropped={dropped} in footer; got: {}",
            &out[footer_start..]
        );
    }

    #[test]
    fn test_max_output_bytes_zero_means_unlimited() {
        let input = "a".repeat(100_000);
        let out = truncate_with_footer(&input, 0);
        assert_eq!(out, input);
        assert!(!out.contains("[truncated:"));
    }

    #[test]
    fn test_max_output_bytes_utf8_boundary() {
        // The cut happens at (cap - footer - 1), well below `cap`, so a single
        // multibyte char placed near `cap` would never be hit. Use an input
        // made only of 2-byte 'é' and sweep consecutive caps: at least one of
        // them puts the body budget on an odd byte offset, i.e. in the middle
        // of a codepoint, and the cut has to back up.
        let s = "é".repeat(5_000);
        for cap in 4096..4100usize {
            let out = truncate_with_footer(&s, cap);
            assert!(out.len() <= cap, "cap {cap}: got {} bytes", out.len());

            let footer_start = out
                .find("\n[truncated:")
                .expect("footer present on its own line");
            let body = &out[..footer_start];
            assert!(!body.is_empty(), "cap {cap}: body unexpectedly empty");
            assert_eq!(body.len() % 2, 0, "cap {cap}: body ends mid-codepoint");
            assert!(
                body.chars().all(|c| c == 'é'),
                "cap {cap}: body contains a damaged character"
            );

            let dropped = s.len() - body.len();
            assert!(
                out[footer_start..].contains(&format!("[truncated: {dropped} more bytes")),
                "cap {cap}: footer does not report dropped={dropped}"
            );
        }
    }

    // -- mode tests --

    #[test]
    fn test_mode_summary_returns_links_only() {
        let md = r"# Some Headline
This is body text that summary mode should NOT include.
Read more articles:
- [Story One](https://example.com/story1)
- [Story Two](https://example.com/story2)
- [Story Three](https://example.com/story3)
- [Story Four](https://example.com/story4)
- [Story Five](https://example.com/story5)
";
        let r = make_result(md);
        let out = to_llm_summary(&r, Some("https://example.com/"));

        // Should contain all 5 links.
        for n in ["Story One", "Story Two", "Story Three", "Story Four", "Story Five"] {
            assert!(out.contains(n), "summary missing link {n}: {out}");
        }
        // Should NOT contain the body sentence.
        assert!(
            !out.contains("This is body text"),
            "summary leaked body text: {out}"
        );
        // Should have a Links section header.
        assert!(out.contains("## Links"), "missing Links header: {out}");
    }

    #[test]
    fn test_mode_toc_returns_outline() {
        // Paragraphs are separated by blank lines: the collector treats
        // consecutive non-blank lines as one paragraph, so without the
        // separators "More body text" would be folded into Section A's intro.
        let md = r"# Top Level Title

Intro paragraph that should not be associated with H1.

## Section A

First paragraph of section A goes here.

More body text we don't want as intro.

## Section B

First paragraph of section B.

## Section C

First paragraph of section C.
";
        let r = make_result(md);
        let out = to_llm_toc(&r, Some("https://example.com/"));

        // Should have one H1 and three H2s.
        assert!(out.contains("# Top Level Title"), "missing H1: {out}");
        assert!(out.contains("## Section A"), "missing H2-A: {out}");
        assert!(out.contains("## Section B"), "missing H2-B: {out}");
        assert!(out.contains("## Section C"), "missing H2-C: {out}");

        // Should have the first paragraph for each H2.
        assert!(
            out.contains("First paragraph of section A"),
            "missing intro A: {out}"
        );
        assert!(
            out.contains("First paragraph of section B"),
            "missing intro B: {out}"
        );
        assert!(
            out.contains("First paragraph of section C"),
            "missing intro C: {out}"
        );

        // Should NOT contain the second-paragraph-after-A body line.
        assert!(
            !out.contains("More body text"),
            "toc leaked second paragraph: {out}"
        );

        // Structured entries: 1 H1 + 3 H2s.
        let entries = collect_toc_entries(&r);
        assert_eq!(entries.len(), 4, "expected 4 entries, got {entries:?}");
        assert_eq!(entries[0].level, 1);
        assert_eq!(entries[1].level, 2);
    }

    #[test]
    fn test_mode_summary_with_byte_cap() {
        // Generate a summary that's bigger than the cap, then verify cap applies.
        let mut md = String::from("# Lots of links\n\n");
        for i in 0..200 {
            md.push_str(&format!(
                "- [Story number {i} with a fairly long title](https://example.com/story-{i})\n"
            ));
        }
        let r = make_result(&md);
        let summary = to_llm_summary(&r, Some("https://example.com/"));
        assert!(summary.len() > 4096, "expected summary > cap; got {}", summary.len());

        let capped = truncate_with_footer(&summary, 4096);
        assert!(capped.len() <= 4096, "got {} bytes", capped.len());
        assert!(capped.contains("[truncated:"));
    }

    #[test]
    fn test_json_summary_shape() {
        let md = "# T\n\n- [A](https://example.com/a)\n- [B](https://example.com/b)\n";
        let r = make_result(md);
        let s = to_json_summary(&r);
        let v: serde_json::Value = serde_json::from_str(&s).expect("valid JSON");
        let arr = v.as_array().expect("array");
        assert_eq!(arr.len(), 2);
        assert_eq!(arr[0]["title"].as_str().unwrap(), "A");
        assert_eq!(arr[0]["url"].as_str().unwrap(), "https://example.com/a");
    }

    #[test]
    fn test_json_toc_shape() {
        let md = "# H1\n\n## A\n\nIntro A.\n\n## B\n\nIntro B.\n";
        let r = make_result(md);
        let s = to_json_toc(&r);
        let v: serde_json::Value = serde_json::from_str(&s).expect("valid JSON");
        let arr = v.as_array().expect("array");
        assert_eq!(arr.len(), 3);
        assert_eq!(arr[0]["level"].as_u64().unwrap(), 1);
        assert_eq!(arr[0]["heading"].as_str().unwrap(), "H1");
        assert_eq!(arr[1]["level"].as_u64().unwrap(), 2);
        assert_eq!(arr[1]["intro"].as_str().unwrap(), "Intro A.");
    }

    #[test]
    fn test_json_truncation_remains_valid_json() {
        // Build a big serialized JSON.
        let huge = serde_json::json!({
            "data": "x".repeat(100_000),
        });
        let s = serde_json::to_string_pretty(&huge).unwrap();
        let out = truncate_json_with_wrapper(&s, 4096);

        // Resulting string must parse as JSON.
        let parsed: serde_json::Value =
            serde_json::from_str(&out).expect("truncated JSON should still parse");
        assert_eq!(parsed["_truncated"].as_bool(), Some(true));
        assert!(parsed["_original_bytes"].as_u64().is_some());
        assert!(out.len() <= 4096);
    }
}