From aa561e976ab9011d189e64c9ca3fecb3fbcfac5c Mon Sep 17 00:00:00 2001
From: Valerio <massimianivalerio1@gmail.com>
Date: Sun, 10 May 2026 15:06:34 +0200
Subject: [PATCH] Polish llm output quality fixes

---
 CHANGELOG.md                           |  9 +++
 Cargo.lock                             | 14 ++---
 Cargo.toml                             |  2 +-
 crates/webclaw-core/src/llm/cleanup.rs | 12 +++-
 crates/webclaw-core/src/llm/links.rs   | 18 +++++-
 crates/webclaw-core/src/llm/mod.rs     | 39 ++++++++----
 crates/webclaw-core/src/markdown.rs    | 83 ++++++++++++++++++++++++--
 7 files changed, 151 insertions(+), 26 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7858ae4..025b1db 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,15 @@
 All notable changes to webclaw are documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/).
 
+## [0.6.0] — 2026-05-10
+
+### Fixed
+- Improved `--format llm` output quality on modern news and documentation pages. Framework hydration blobs and low-value structured-data records that only describe page chrome (such as `WebSite`, `WebPage`, and `SiteNavigationElement`) are now filtered out before they can flood the LLM context, while content-bearing Schema.org records are preserved. Thanks and congratulations to Nenad Oric (`@devnen`) for the contribution in PR #37.
+- Fixed element-to-text spacing so adjacent inline nodes no longer smash words together, while punctuation stays attached on real pages such as docs, forums, and reference sites.
+- Removed common screen-reader-only link chrome such as "opens new tab" from LLM body text and link labels without stripping ordinary prose that happens to mention external links.
+
+---
+
 ## [0.5.9] — 2026-05-06
 
 ### Fixed
diff --git a/Cargo.lock b/Cargo.lock
index e49ccc3..ab23a3f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3219,7 +3219,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-cli"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "clap",
  "dotenvy",
@@ -3240,7 +3240,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-core"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "ego-tree",
  "once_cell",
@@ -3258,7 +3258,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-fetch"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "async-trait",
  "bytes",
@@ -3284,7 +3284,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-llm"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "async-trait",
  "reqwest",
@@ -3297,7 +3297,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-mcp"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "dirs",
  "dotenvy",
@@ -3317,7 +3317,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-pdf"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "pdf-extract",
  "thiserror",
@@ -3326,7 +3326,7 @@ dependencies = [
 
 [[package]]
 name = "webclaw-server"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
  "anyhow",
  "axum",
diff --git a/Cargo.toml b/Cargo.toml
index 12a4b73..6e87225 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/*"]
 
 [workspace.package]
-version = "0.5.9"
+version = "0.6.0"
 edition = "2024"
 license = "AGPL-3.0"
 repository = "https://github.com/0xMassi/webclaw"
diff --git a/crates/webclaw-core/src/llm/cleanup.rs b/crates/webclaw-core/src/llm/cleanup.rs
index 1f79361..dc447a5 100644
--- a/crates/webclaw-core/src/llm/cleanup.rs
+++ b/crates/webclaw-core/src/llm/cleanup.rs
@@ -160,7 +160,7 @@ pub(crate) fn strip_leaked_js(input: &str) -> String {
 pub(crate) fn strip_a11y_link_chrome(input: &str) -> String {
     static A11Y_PATTERN: Lazy<Regex> = Lazy::new(|| {
         Regex::new(
-            r"(?i)\s*,?\s*\b(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?",
+            r"(?i)(?:\s*,\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?|\s+\((?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\)\.?|\s+external link\b\.?$)",
         )
         .unwrap()
     });
@@ -1424,13 +1424,19 @@ mod tests {
 
     #[test]
     fn a11y_preserves_code_blocks() {
-        let input = "```\nopens new tab is a function\n```\nopens new tab here";
+        let input = "```\nopens new tab is a function\n```\nDownload, opens new tab";
         let out = strip_a11y_link_chrome(input);
         assert!(
             out.contains("opens new tab is a function"),
             "code stripped: {out}"
         );
         // Outside the fence, the chrome is removed.
-        assert!(!out.ends_with("opens new tab here"));
+        assert!(!out.to_lowercase().contains("download, opens new tab"));
+    }
+
+    #[test]
+    fn a11y_preserves_external_link_prose() {
+        let input = "Researchers found an external link between the two incidents.";
+        assert_eq!(strip_a11y_link_chrome(input), input);
     }
 }
diff --git a/crates/webclaw-core/src/llm/links.rs b/crates/webclaw-core/src/llm/links.rs
index 9873182..3d25179 100644
--- a/crates/webclaw-core/src/llm/links.rs
+++ b/crates/webclaw-core/src/llm/links.rs
@@ -90,7 +90,7 @@ static MD_MARKERS_RE: Lazy<Regex> =
 
 static A11Y_LABEL_RE: Lazy<Regex> = Lazy::new(|| {
     Regex::new(
-        r"(?i)\s*,?\s*\b(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?",
+        r"(?i)(?:\s*,?\s*\b(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website))\b\.?|\s*,\s*external link\b\.?|\s+external link\b\.?$)",
     )
     .unwrap()
 });
@@ -190,4 +190,20 @@ mod tests {
         assert!(is_noise_link("user", "https://hn.com/user?id=foo"));
         assert!(!is_noise_link("Rust docs", "https://rust-lang.org"));
     }
+
+    #[test]
+    fn link_label_preserves_external_link_prose() {
+        assert_eq!(
+            clean_link_label("Research found an external link between incidents"),
+            "Research found an external link between incidents"
+        );
+    }
+
+    #[test]
+    fn link_label_strips_terminal_external_link_chrome() {
+        assert_eq!(
+            clean_link_label("Reuters story external link"),
+            "Reuters story"
+        );
+    }
 }
diff --git a/crates/webclaw-core/src/llm/mod.rs b/crates/webclaw-core/src/llm/mod.rs
index 7314cbe..bc65be6 100644
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@@ -80,26 +80,32 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
 /// hydration state.
 fn is_useful_structured_data(v: &serde_json::Value) -> bool {
     let Some(obj) = v.as_object() else {
+        // SvelteKit can emit compact arrays of page data. Keep those if they
+        // are small enough to be useful, while still dropping giant hydration
+        // arrays under the same budget as untyped objects.
+        if v.is_array() {
+            let serialized = serde_json::to_string(v).unwrap_or_default();
+            return serialized.len() <= 4 * 1024;
+        }
         return false;
     };
     // JSON-LD: @type drives the decision.
     if let Some(t) = obj.get("@type") {
-        let type_str = match t {
-            serde_json::Value::String(s) => s.clone(),
+        let types: Vec<String> = match t {
+            serde_json::Value::String(s) => vec![s.to_ascii_lowercase()],
             serde_json::Value::Array(a) => a
                 .iter()
                 .filter_map(|x| x.as_str())
-                .collect::<Vec<_>>()
-                .join(","),
-            _ => String::new(),
+                .map(str::to_ascii_lowercase)
+                .collect(),
+            _ => Vec::new(),
         };
-        let lower = type_str.to_ascii_lowercase();
-        // Drop low-info chrome types.
-        const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"];
-        if DROP_TYPES.iter().any(|d| lower == *d) {
+        if types.is_empty() {
             return false;
         }
-        return !lower.is_empty();
+        // Drop low-info chrome types.
+        const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"];
+        return types.iter().any(|t| !DROP_TYPES.iter().any(|d| t == d));
     }
     // Next.js pageProps / SvelteKit data without @type: keep only if compact.
     // Anything over ~4KB is almost certainly hydration state, not content.
@@ -821,4 +827,17 @@ mod tests {
             "Compact untyped dropped: {out}"
         );
     }
+
+    #[test]
+    fn structured_data_keeps_compact_untyped_array() {
+        // SvelteKit can emit compact arrays rather than objects.
+        let r = make_result_with_structured(vec![serde_json::json!([
+            { "title": "Hi", "body": "small array item" }
+        ])]);
+        let out = to_llm_text(&r, None);
+        assert!(
+            out.contains("small array item"),
+            "Compact untyped array dropped: {out}"
+        );
+    }
 }
diff --git a/crates/webclaw-core/src/markdown.rs b/crates/webclaw-core/src/markdown.rs
index cacadb2..2699166 100644
--- a/crates/webclaw-core/src/markdown.rs
+++ b/crates/webclaw-core/src/markdown.rs
@@ -367,11 +367,65 @@ fn inline_text(
 
 /// Check whether a space is needed between two adjacent chunks of output.
 /// Returns true when the left side doesn't end with whitespace and the right
-/// side doesn't start with whitespace — i.e., two words would be mashed together.
+/// side doesn't start with whitespace, except around punctuation that should
+/// bind to the adjacent token.
 fn needs_separator(left: &str, right: &str) -> bool {
-    let l = left.as_bytes().last().copied().unwrap_or(b' ');
-    let r = right.as_bytes().first().copied().unwrap_or(b' ');
-    !l.is_ascii_whitespace() && !r.is_ascii_whitespace()
+    let l = left.chars().next_back().unwrap_or(' ');
+    let r = right.chars().next().unwrap_or(' ');
+    // An empty chunk falls back to ' ', so it never triggers a separator.
+    if l.is_whitespace() || r.is_whitespace() {
+        return false;
+    }
+
+    // A closer on the right binds leftward: avoid "word ," / "word )" / "word 's".
+    if is_closing_punctuation(r) {
+        return false;
+    }
+
+    // An opener on the left binds rightward: avoid "( word" / "[ 1".
+    if is_opening_punctuation(l) {
+        return false;
+    }
+
+    // Plural/possessive suffixes glued to inline code (`Option`s, `x`'s) or to a
+    // markdown link (which ends in `)`): keep them attached as a single token.
+    if matches!(l, '`' | ')') && starts_with_inline_code_suffix(right) {
+        return false;
+    }
+
+    true
+}
+
+fn starts_with_inline_code_suffix(s: &str) -> bool {
+    let trimmed = s.trim_start_matches(['*', '_']);
+    let mut chars = trimmed.chars();
+    let Some(first) = chars.next() else {
+        return false;
+    };
+    // After any leading `*`/`_` markers, an apostrophe starts a possessive such as `x`'s.
+    if matches!(first, '\'' | '’') {
+        return true;
+    }
+    // Otherwise only a plural "s"/"S" can qualify as a suffix.
+    if !matches!(first, 's' | 'S') {
+        return false;
+    }
+    // The "s" must stand alone: end of chunk, whitespace, a closer, or an emphasis marker.
+    match chars.next() {
+        None => true,
+        Some(c) => c.is_whitespace() || is_closing_punctuation(c) || matches!(c, '*' | '_'),
+    }
+}
+
+fn is_closing_punctuation(c: char) -> bool {
+    matches!(
+        c,
+        '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '}' | '%' | '\'' | '’' | '"' | '”'
+    )
+}
+
+fn is_opening_punctuation(c: char) -> bool {
+    matches!(c, '(' | '[' | '{' | '"' | '“')
 }
 
 /// Collect raw text content (no markdown formatting).
@@ -1626,4 +1680,25 @@ mod tests {
             "Element->Text->Element smashed together: {md}"
         );
     }
+
+    #[test]
+    fn punctuation_after_inline_element_stays_attached() {
+        let html = r#"<p><span>Hello</span>, world. Use <code>package.json</code>.</p>"#;
+        let (md, _, _) = convert_html(html, None);
+        assert!(md.contains("Hello, world"), "punctuation detached: {md}");
+        assert!(
+            md.contains("`package.json`."),
+            "code punctuation detached: {md}"
+        );
+    }
+
+    #[test]
+    fn inline_code_suffix_stays_attached() {
+        let html = r#"<p><a href="https://example.com"><code>NullPointerException</code></a><em>s</em> are common.</p>"#;
+        let (md, _, _) = convert_html(html, None);
+        assert!(
+            md.contains("[`NullPointerException`](https://example.com)*s* are common"),
+            "code suffix detached: {md}"
+        );
+    }
 }