mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-10 22:45:13 +02:00
feat(cli): add --max-output-bytes and --mode summary,toc for output-size control
Three additive CLI flags address the 50KB persisted-output cap that trips Claude Code's per-tool-result harness on aggregator front pages (apnews.com, cnbc.com/markets/, and b92.net all exceed 50KB by default):

- --max-output-bytes N: truncates the final output at N bytes with a clear '[truncated: M more bytes ...]' footer. N=0 means unlimited (default). Truncation is UTF-8 codepoint-boundary safe; JSON output is also wrapped so that truncated output stays parseable.
- --mode summary: returns only the extracted link list (titles + URLs), with no body text. Intended for aggregator front pages, where the LLM is going to drill into the individual articles next anyway.
- --mode toc: returns the H1/H2 outline plus the first paragraph after each H2. Intended for long single-article pages.

The new flags are orthogonal to -f (json/llm/text). 9 new unit tests in webclaw-core; the total goes from 308 to 317 passing. Smoke-tested on apnews.com (51713 -> 27404 summary -> 6269 toc -> 8193 capped), pitchfork.com (42049 -> 379 summary), and cnbc.com (56682 -> 16385 capped).
This commit is contained in:
parent
562c6a15f0
commit
339f41bb7c
4 changed files with 756 additions and 54 deletions
|
|
@ -170,6 +170,16 @@ struct Cli {
|
|||
#[arg(short, long, default_value = "markdown")]
|
||||
format: OutputFormat,
|
||||
|
||||
/// Output mode: full (default), summary (link list), or toc (H1/H2 outline + first paragraph).
|
||||
/// Orthogonal to --format; e.g. `-f json --mode summary` returns a JSON link array.
|
||||
#[arg(long, default_value = "full")]
|
||||
mode: OutputMode,
|
||||
|
||||
/// Cap the final output at N bytes; on overflow truncate at a UTF-8 boundary
|
||||
/// and append a [truncated: N more bytes ...] footer. 0 = no cap (default).
|
||||
#[arg(long, default_value = "0")]
|
||||
max_output_bytes: u64,
|
||||
|
||||
/// Browser to impersonate
|
||||
#[arg(short, long, default_value = "chrome")]
|
||||
browser: Browser,
|
||||
|
|
@ -413,6 +423,17 @@ enum OutputFormat {
|
|||
Html,
|
||||
}
|
||||
|
||||
/// Output mode. `full` is the default and matches the historical
|
||||
/// behaviour; `summary` returns just the navigation/link list; `toc`
|
||||
/// returns the H1/H2 outline plus the first paragraph after each H2.
|
||||
/// Orthogonal to `--format`.
|
||||
#[derive(Clone, ValueEnum, PartialEq, Eq)]
|
||||
enum OutputMode {
|
||||
Full,
|
||||
Summary,
|
||||
Toc,
|
||||
}
|
||||
|
||||
#[derive(Clone, ValueEnum)]
|
||||
enum Browser {
|
||||
Chrome,
|
||||
|
|
@ -719,26 +740,80 @@ fn raw_html_or_markdown(result: &ExtractionResult) -> &str {
|
|||
|
||||
/// Format an `ExtractionResult` into a string for the given output format.
|
||||
fn format_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) -> String {
|
||||
format_output_with_mode(result, format, show_metadata, &OutputMode::Full, 0)
|
||||
}
|
||||
|
||||
/// Format an `ExtractionResult` for the given format and mode, then apply
|
||||
/// the byte cap. Returns the final string ready for stdout / disk.
|
||||
///
|
||||
/// `mode == Full` reproduces the legacy behaviour exactly.
|
||||
/// `mode == Summary` returns just the link list (text-formats) or a JSON
|
||||
/// array of `{title, url}` (json format).
|
||||
/// `mode == Toc` returns an H1/H2 outline + first paragraph after each H2.
|
||||
///
|
||||
/// `max_output_bytes == 0` disables the cap. Otherwise the output is
|
||||
/// truncated at a UTF-8 boundary with a `[truncated: ...]` footer
|
||||
/// (or a `_truncated` wrapper for JSON, so the document stays parseable).
|
||||
fn format_output_with_mode(
|
||||
result: &ExtractionResult,
|
||||
format: &OutputFormat,
|
||||
show_metadata: bool,
|
||||
mode: &OutputMode,
|
||||
max_output_bytes: u64,
|
||||
) -> String {
|
||||
let body = render_body(result, format, show_metadata, mode);
|
||||
apply_byte_cap(&body, format, max_output_bytes)
|
||||
}
|
||||
|
||||
fn render_body(
|
||||
result: &ExtractionResult,
|
||||
format: &OutputFormat,
|
||||
show_metadata: bool,
|
||||
mode: &OutputMode,
|
||||
) -> String {
|
||||
match mode {
|
||||
OutputMode::Summary => match format {
|
||||
OutputFormat::Json => webclaw_core::to_json_summary(result),
|
||||
_ => webclaw_core::to_llm_summary(result, result.metadata.url.as_deref()),
|
||||
},
|
||||
OutputMode::Toc => match format {
|
||||
OutputFormat::Json => webclaw_core::to_json_toc(result),
|
||||
_ => webclaw_core::to_llm_toc(result, result.metadata.url.as_deref()),
|
||||
},
|
||||
OutputMode::Full => match format {
|
||||
OutputFormat::Markdown => {
|
||||
let mut out = String::new();
|
||||
if show_metadata {
|
||||
out.push_str(&format_frontmatter(&result.metadata));
|
||||
}
|
||||
out.push_str(&result.content.markdown);
|
||||
if !result.structured_data.is_empty() {
|
||||
out.push_str("\n\n## Structured Data\n\n```json\n");
|
||||
out.push_str(
|
||||
&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default(),
|
||||
);
|
||||
out.push_str("\n```");
|
||||
}
|
||||
out
|
||||
}
|
||||
OutputFormat::Json => {
|
||||
serde_json::to_string_pretty(result).expect("serialization failed")
|
||||
}
|
||||
OutputFormat::Text => result.content.plain_text.clone(),
|
||||
OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
|
||||
OutputFormat::Html => raw_html_or_markdown(result).to_string(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn apply_byte_cap(body: &str, format: &OutputFormat, cap: u64) -> String {
|
||||
if cap == 0 {
|
||||
return body.to_string();
|
||||
}
|
||||
let cap = cap as usize;
|
||||
match format {
|
||||
OutputFormat::Markdown => {
|
||||
let mut out = String::new();
|
||||
if show_metadata {
|
||||
out.push_str(&format_frontmatter(&result.metadata));
|
||||
}
|
||||
out.push_str(&result.content.markdown);
|
||||
if !result.structured_data.is_empty() {
|
||||
out.push_str("\n\n## Structured Data\n\n```json\n");
|
||||
out.push_str(
|
||||
&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default(),
|
||||
);
|
||||
out.push_str("\n```");
|
||||
}
|
||||
out
|
||||
}
|
||||
OutputFormat::Json => serde_json::to_string_pretty(result).expect("serialization failed"),
|
||||
OutputFormat::Text => result.content.plain_text.clone(),
|
||||
OutputFormat::Llm => to_llm_text(result, result.metadata.url.as_deref()),
|
||||
OutputFormat::Html => raw_html_or_markdown(result).to_string(),
|
||||
OutputFormat::Json => webclaw_core::truncate_json_with_wrapper(body, cap),
|
||||
_ => webclaw_core::truncate_with_footer(body, cap),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1036,37 +1111,15 @@ fn format_frontmatter(meta: &Metadata) -> String {
|
|||
lines.join("\n")
|
||||
}
|
||||
|
||||
fn print_output(result: &ExtractionResult, format: &OutputFormat, show_metadata: bool) {
|
||||
match format {
|
||||
OutputFormat::Markdown => {
|
||||
if show_metadata {
|
||||
print!("{}", format_frontmatter(&result.metadata));
|
||||
}
|
||||
println!("{}", result.content.markdown);
|
||||
if !result.structured_data.is_empty() {
|
||||
println!(
|
||||
"\n## Structured Data\n\n```json\n{}\n```",
|
||||
serde_json::to_string_pretty(&result.structured_data).unwrap_or_default()
|
||||
);
|
||||
}
|
||||
}
|
||||
OutputFormat::Json => {
|
||||
// serde_json::to_string_pretty won't fail on our types
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(result).expect("serialization failed")
|
||||
);
|
||||
}
|
||||
OutputFormat::Text => {
|
||||
println!("{}", result.content.plain_text);
|
||||
}
|
||||
OutputFormat::Llm => {
|
||||
println!("{}", to_llm_text(result, result.metadata.url.as_deref()));
|
||||
}
|
||||
OutputFormat::Html => {
|
||||
println!("{}", raw_html_or_markdown(result));
|
||||
}
|
||||
}
|
||||
fn print_output_with_mode(
|
||||
result: &ExtractionResult,
|
||||
format: &OutputFormat,
|
||||
show_metadata: bool,
|
||||
mode: &OutputMode,
|
||||
max_output_bytes: u64,
|
||||
) {
|
||||
let out = format_output_with_mode(result, format, show_metadata, mode, max_output_bytes);
|
||||
println!("{out}");
|
||||
}
|
||||
|
||||
/// Print cloud API response in the requested format.
|
||||
|
|
@ -1132,6 +1185,53 @@ fn print_cloud_output(resp: &serde_json::Value, format: &OutputFormat) {
|
|||
}
|
||||
}
|
||||
|
||||
/// Render the cloud response into a string per `format`, then apply
|
||||
/// `--max-output-bytes` if non-zero. Mirrors `print_cloud_output` exactly
|
||||
/// when `cap == 0`.
|
||||
fn print_cloud_output_capped(resp: &serde_json::Value, format: &OutputFormat, cap: u64) {
|
||||
if cap == 0 {
|
||||
print_cloud_output(resp, format);
|
||||
return;
|
||||
}
|
||||
let body = render_cloud_body(resp, format);
|
||||
println!("{}", apply_byte_cap(&body, format, cap));
|
||||
}
|
||||
|
||||
fn render_cloud_body(resp: &serde_json::Value, format: &OutputFormat) -> String {
|
||||
match format {
|
||||
OutputFormat::Json => {
|
||||
serde_json::to_string_pretty(resp).expect("serialization failed")
|
||||
}
|
||||
OutputFormat::Markdown => resp
|
||||
.get("content")
|
||||
.and_then(|c| c.get("markdown"))
|
||||
.and_then(|m| m.as_str())
|
||||
.map(|s| s.to_string())
|
||||
.or_else(|| resp.get("markdown").and_then(|m| m.as_str()).map(|s| s.to_string()))
|
||||
.unwrap_or_else(|| {
|
||||
serde_json::to_string_pretty(resp).expect("serialization failed")
|
||||
}),
|
||||
OutputFormat::Text => resp
|
||||
.get("content")
|
||||
.and_then(|c| c.get("plain_text"))
|
||||
.and_then(|t| t.as_str())
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| render_cloud_body(resp, &OutputFormat::Markdown)),
|
||||
OutputFormat::Llm => resp
|
||||
.get("content")
|
||||
.and_then(|c| c.get("llm_text"))
|
||||
.and_then(|t| t.as_str())
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| render_cloud_body(resp, &OutputFormat::Markdown)),
|
||||
OutputFormat::Html => resp
|
||||
.get("content")
|
||||
.and_then(|c| c.get("raw_html"))
|
||||
.and_then(|h| h.as_str())
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| render_cloud_body(resp, &OutputFormat::Markdown)),
|
||||
}
|
||||
}
|
||||
|
||||
fn print_diff_output(diff: &ContentDiff, format: &OutputFormat) {
|
||||
match format {
|
||||
OutputFormat::Json => {
|
||||
|
|
@ -2662,17 +2762,33 @@ async fn main() {
|
|||
.unwrap_or_default();
|
||||
let custom_name = entries.first().and_then(|(_, name)| name.clone());
|
||||
let filename = custom_name.unwrap_or_else(|| url_to_filename(&url, &cli.format));
|
||||
let content = format_output(&result, &cli.format, cli.metadata);
|
||||
let content = format_output_with_mode(
|
||||
&result,
|
||||
&cli.format,
|
||||
cli.metadata,
|
||||
&cli.mode,
|
||||
cli.max_output_bytes,
|
||||
);
|
||||
if let Err(e) = write_to_file(dir, &filename, &content) {
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
} else {
|
||||
print_output(&result, &cli.format, cli.metadata);
|
||||
print_output_with_mode(
|
||||
&result,
|
||||
&cli.format,
|
||||
cli.metadata,
|
||||
&cli.mode,
|
||||
cli.max_output_bytes,
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(FetchOutput::Cloud(resp)) => {
|
||||
print_cloud_output(&resp, &cli.format);
|
||||
// Cloud path does not yet have a structured ExtractionResult,
|
||||
// so --mode summary/toc can't be applied here. We still apply
|
||||
// the byte cap to the rendered cloud output by routing through
|
||||
// a helper that prints to a buffer first.
|
||||
print_cloud_output_capped(&resp, &cli.format, cli.max_output_bytes);
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("{e}");
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue