From aa561e976ab9011d189e64c9ca3fecb3fbcfac5c Mon Sep 17 00:00:00 2001 From: Valerio Date: Sun, 10 May 2026 15:06:34 +0200 Subject: [PATCH] Polish llm output quality fixes --- CHANGELOG.md | 9 +++ Cargo.lock | 14 ++--- Cargo.toml | 2 +- crates/webclaw-core/src/llm/cleanup.rs | 12 +++- crates/webclaw-core/src/llm/links.rs | 18 +++++- crates/webclaw-core/src/llm/mod.rs | 39 ++++++++---- crates/webclaw-core/src/markdown.rs | 83 ++++++++++++++++++++++++-- 7 files changed, 151 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7858ae4..025b1db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,15 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.6.0] — 2026-05-10 + +### Fixed +- Improved `--format llm` output quality on modern news and documentation pages. Framework hydration blobs and low-value page chrome structured-data records are now filtered out before they can flood the LLM context, while content-bearing Schema.org records are preserved. Thanks and congrats to Nenad Oric (`@devnen`) for the contribution in PR #37. +- Fixed element-to-text spacing so adjacent inline nodes no longer smash words together, while punctuation stays attached on real pages such as docs, forums, and reference sites. +- Removed common screen-reader-only link chrome such as "opens new tab" from LLM body text and link labels without stripping ordinary prose that happens to mention external links. + +--- + ## [0.5.9] — 2026-05-06 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index e49ccc3..ab23a3f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3219,7 +3219,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.5.9" +version = "0.6.0" dependencies = [ "clap", "dotenvy", @@ -3240,7 +3240,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.5.9" +version = "0.6.0" dependencies = [ "ego-tree", "once_cell", @@ -3258,7 +3258,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.5.9" +version = "0.6.0" dependencies = [ "async-trait", "bytes", @@ -3284,7 +3284,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.5.9" +version = "0.6.0" dependencies = [ "async-trait", "reqwest", @@ -3297,7 +3297,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.5.9" +version = "0.6.0" dependencies = [ "dirs", "dotenvy", @@ -3317,7 +3317,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.5.9" +version = "0.6.0" dependencies = [ "pdf-extract", "thiserror", @@ -3326,7 +3326,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.5.9" +version = "0.6.0" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index 12a4b73..6e87225 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.5.9" +version = "0.6.0" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-core/src/llm/cleanup.rs b/crates/webclaw-core/src/llm/cleanup.rs index 1f79361..dc447a5 100644 --- a/crates/webclaw-core/src/llm/cleanup.rs +++ b/crates/webclaw-core/src/llm/cleanup.rs @@ -160,7 +160,7 @@ pub(crate) fn strip_leaked_js(input: &str) -> String { pub(crate) fn strip_a11y_link_chrome(input: &str) -> String { static A11Y_PATTERN: Lazy = Lazy::new(|| { Regex::new( - r"(?i)\s*,?\s*\b(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?", + r"(?i)(?:\s*,\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?|\s+\((?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\)\.?|\s+external link\b\.?$)", ) .unwrap() }); @@ -1424,13 +1424,19 @@ mod tests { #[test] fn a11y_preserves_code_blocks() { - let input = "```\nopens new tab is a function\n```\nopens new tab here"; + let input = "```\nopens new tab is a function\n```\nDownload, opens new tab"; let out = strip_a11y_link_chrome(input); assert!( out.contains("opens new tab is a function"), "code stripped: {out}" ); // Outside the fence, the chrome is removed. - assert!(!out.ends_with("opens new tab here")); + assert!(!out.to_lowercase().contains("download, opens new tab")); + } + + #[test] + fn a11y_preserves_external_link_prose() { + let input = "Researchers found an external link between the two incidents."; + assert_eq!(strip_a11y_link_chrome(input), input); } } diff --git a/crates/webclaw-core/src/llm/links.rs b/crates/webclaw-core/src/llm/links.rs index 9873182..3d25179 100644 --- a/crates/webclaw-core/src/llm/links.rs +++ b/crates/webclaw-core/src/llm/links.rs @@ -90,7 +90,7 @@ static MD_MARKERS_RE: Lazy = static A11Y_LABEL_RE: Lazy = Lazy::new(|| { Regex::new( - r"(?i)\s*,?\s*\b(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?", + r"(?i)(?:\s*,?\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website))\b\.?|\s*,\s*external link\b\.?|\s+external link\b\.?$)", ) .unwrap() }); @@ -190,4 +190,20 @@ mod tests { assert!(is_noise_link("user", "https://hn.com/user?id=foo")); assert!(!is_noise_link("Rust docs", "https://rust-lang.org")); } + + #[test] + fn link_label_preserves_external_link_prose() { + assert_eq!( + clean_link_label("Research found an external link between incidents"), + "Research found an external link between incidents" + ); + } + + #[test] + fn link_label_strips_terminal_external_link_chrome() { + assert_eq!( + clean_link_label("Reuters story external link"), + "Reuters story" + ); + } } diff --git a/crates/webclaw-core/src/llm/mod.rs b/crates/webclaw-core/src/llm/mod.rs index 7314cbe..bc65be6 100644 --- a/crates/webclaw-core/src/llm/mod.rs +++ b/crates/webclaw-core/src/llm/mod.rs @@ -80,26 +80,32 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String { /// hydration state. fn is_useful_structured_data(v: &serde_json::Value) -> bool { let Some(obj) = v.as_object() else { + // SvelteKit can emit compact arrays of page data. Keep those if they + // are small enough to be useful, while still dropping giant hydration + // arrays under the same budget as untyped objects. + if v.is_array() { + let serialized = serde_json::to_string(v).unwrap_or_default(); + return serialized.len() <= 4 * 1024; + } return false; }; // JSON-LD: @type drives the decision. if let Some(t) = obj.get("@type") { - let type_str = match t { - serde_json::Value::String(s) => s.clone(), + let types: Vec = match t { + serde_json::Value::String(s) => vec![s.to_ascii_lowercase()], serde_json::Value::Array(a) => a .iter() .filter_map(|x| x.as_str()) - .collect::>() - .join(","), - _ => String::new(), + .map(str::to_ascii_lowercase) + .collect(), + _ => Vec::new(), }; - let lower = type_str.to_ascii_lowercase(); - // Drop low-info chrome types. - const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"]; - if DROP_TYPES.iter().any(|d| lower == *d) { + if types.is_empty() { return false; } - return !lower.is_empty(); + // Drop low-info chrome types. + const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"]; + return types.iter().any(|t| !DROP_TYPES.iter().any(|d| t == d)); } // Next.js pageProps / SvelteKit data without @type: keep only if compact. // Anything over ~4KB is almost certainly hydration state, not content. @@ -821,4 +827,17 @@ mod tests { "Compact untyped dropped: {out}" ); } + + #[test] + fn structured_data_keeps_compact_untyped_array() { + // SvelteKit can emit compact arrays rather than objects. + let r = make_result_with_structured(vec![serde_json::json!([ + { "title": "Hi", "body": "small array item" } + ])]); + let out = to_llm_text(&r, None); + assert!( + out.contains("small array item"), + "Compact untyped array dropped: {out}" + ); + } } diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs index cacadb2..2699166 100644 --- a/crates/webclaw-core/src/markdown.rs +++ b/crates/webclaw-core/src/markdown.rs @@ -367,11 +367,65 @@ fn inline_text( /// Check whether a space is needed between two adjacent chunks of output. /// Returns true when the left side doesn't end with whitespace and the right -/// side doesn't start with whitespace — i.e., two words would be mashed together. +/// side doesn't start with whitespace, except around punctuation that should +/// bind to the adjacent token. fn needs_separator(left: &str, right: &str) -> bool { - let l = left.as_bytes().last().copied().unwrap_or(b' '); - let r = right.as_bytes().first().copied().unwrap_or(b' '); - !l.is_ascii_whitespace() && !r.is_ascii_whitespace() + let l = left.chars().next_back().unwrap_or(' '); + let r = right.chars().next().unwrap_or(' '); + + if l.is_whitespace() || r.is_whitespace() { + return false; + } + + // Do not create "word ," / "word )" / "word 's" artifacts. + if is_closing_punctuation(r) { + return false; + } + + // Do not create "( word" / "[ 1" artifacts. + if is_opening_punctuation(l) { + return false; + } + + // Common inline-code suffixes: `Option`s, `x`'s. Treat them like a + // single token rather than separating the text node. + if matches!(l, '`' | ')') && starts_with_inline_code_suffix(right) { + return false; + } + + true +} + +fn starts_with_inline_code_suffix(s: &str) -> bool { + let trimmed = s.trim_start_matches(['*', '_']); + let mut chars = trimmed.chars(); + let Some(first) = chars.next() else { + return false; + }; + + if matches!(first, '\'' | '’') { + return true; + } + + if !matches!(first, 's' | 'S') { + return false; + } + + match chars.next() { + None => true, + Some(c) => c.is_whitespace() || is_closing_punctuation(c) || matches!(c, '*' | '_'), + } +} + +fn is_closing_punctuation(c: char) -> bool { + matches!( + c, + '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '}' | '%' | '\'' | '’' | '"' | '”' + ) +} + +fn is_opening_punctuation(c: char) -> bool { + matches!(c, '(' | '[' | '{' | '"' | '“') } /// Collect raw text content (no markdown formatting). @@ -1626,4 +1680,25 @@ mod tests { "Element->Text->Element smashed together: {md}" ); } + + #[test] + fn punctuation_after_inline_element_stays_attached() { + let html = r#"

Hello, world. Use package.json.

"#; + let (md, _, _) = convert_html(html, None); + assert!(md.contains("Hello, world"), "punctuation detached: {md}"); + assert!( + md.contains("`package.json`."), + "code punctuation detached: {md}" + ); + } + + #[test] + fn inline_code_suffix_stays_attached() { + let html = r#"

NullPointerExceptions are common.

"#; + let (md, _, _) = convert_html(html, None); + assert!( + md.contains("[`NullPointerException`](https://example.com)*s* are common"), + "code suffix detached: {md}" + ); + } }