Improve --format llm output quality (#37)

Improve LLM-format output for modern news and documentation pages.

- Filter noisy hydration and low-value page chrome structured data while preserving content-bearing Schema.org records
- Fix element/text spacing without detaching punctuation on docs, forums, and reference pages
- Remove common accessibility link chrome from LLM text and link labels
- Bump workspace version to 0.6.0 and update the changelog

Thanks to Nenad Oric (@devnen) for the original PR and contribution.
2026-07-23 07:21:02 +02:00 · 2026-05-10 15:11:12 +02:00 · 2026-05-10 15:11:12 +02:00 · e8ca1417d6
commit e8ca1417d6
parent 7f75143954
8 changed files with 371 additions and 16 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,6 +3,15 @@
 All notable changes to webclaw are documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/).
 ## [0.6.0] — 2026-05-10
 ### Fixed
 - Improved `--format llm` output quality on modern news and documentation pages. Framework hydration blobs and low-value page chrome structured-data records are now filtered out before they can flood the LLM context, while content-bearing Schema.org records are preserved. Thanks and congrats to Nenad Oric (`@devnen`) for the contribution in PR #37.
 - Fixed element-to-text spacing so adjacent inline nodes no longer smash words together, while punctuation stays attached on real pages such as docs, forums, and reference sites.
 - Removed common screen-reader-only link chrome such as "opens new tab" from LLM body text and link labels without stripping ordinary prose that happens to mention external links.
 ---
 ## [0.5.9] — 2026-05-06
 ### Fixed
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3219,7 +3219,7 @@ dependencies = [
 [[package]]
 name = "webclaw-cli"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
 "clap",
 "dotenvy",
@ -3240,7 +3240,7 @@ dependencies = [
 [[package]]
 name = "webclaw-core"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
 "ego-tree",
 "once_cell",
@ -3258,7 +3258,7 @@ dependencies = [
 [[package]]
 name = "webclaw-fetch"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
 "async-trait",
 "bytes",
@ -3284,7 +3284,7 @@ dependencies = [
 [[package]]
 name = "webclaw-llm"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
 "async-trait",
 "reqwest",
@ -3297,7 +3297,7 @@ dependencies = [
 [[package]]
 name = "webclaw-mcp"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
 "dirs",
 "dotenvy",
@ -3317,7 +3317,7 @@ dependencies = [
 [[package]]
 name = "webclaw-pdf"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
 "pdf-extract",
 "thiserror",
@ -3326,7 +3326,7 @@ dependencies = [
 [[package]]
 name = "webclaw-server"
-version = "0.5.9"
+version = "0.6.0"
 dependencies = [
 "anyhow",
 "axum",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/*"]
 [workspace.package]
-version = "0.5.9"
+version = "0.6.0"
 edition = "2024"
 license = "AGPL-3.0"
 repository = "https://github.com/0xMassi/webclaw"
--- a/crates/webclaw-core/src/llm/body.rs
+++ b/crates/webclaw-core/src/llm/body.rs
@ -29,6 +29,9 @@ pub(crate) fn process_body(markdown: &str) -> ProcessedBody {
    // 0c. Strip leaked JavaScript (framework hydration, self.__wrap_n, etc.)
    let text = cleanup::strip_leaked_js(&text);
    // 0c2. Strip a11y link chrome ("opens new tab", external link hints)
    let text = cleanup::strip_a11y_link_chrome(&text);
    // 0d. Collapse spaced-out text (CSS animation artifacts like "S t a r t")
    // Must run before any dedup -- spaced text confuses word-based dedup.
    let text = cleanup::collapse_spaced_text(&text);
--- a/crates/webclaw-core/src/llm/cleanup.rs
+++ b/crates/webclaw-core/src/llm/cleanup.rs
@ -146,6 +146,45 @@ pub(crate) fn strip_leaked_js(input: &str) -> String {
    out
 }
 // ---------------------------------------------------------------------------
 // Accessibility link chrome ("opens new tab", "external link")
 // ---------------------------------------------------------------------------
 /// Remove screen-reader-only link annotations that leaked into rendered text.
 ///
 /// Some sites (Reuters is one example) attach visually hidden spans such as
 /// `<span class="visually-hidden">, opens new tab</span>` to external or
 /// new-window links. There is no class hook that is consistent across sites
 /// for the noise filter to key on, so the most common phrasings are scrubbed
 /// here at the text level instead.
 ///
 /// The input is processed line by line. Fenced code blocks (delimited by
 /// lines whose trimmed text starts with three backticks) and the fence lines
 /// themselves are copied through untouched. Output lines are joined with
 /// `\n`, so a trailing newline in `input` is not reproduced.
 pub(crate) fn strip_a11y_link_chrome(input: &str) -> String {
    static A11Y_PATTERN: Lazy<Regex> = Lazy::new(|| {
        Regex::new(
            r"(?i)(?:\s*,\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\b\.?|\s+\((?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website)|external link)\)\.?|\s+external link\b\.?$)",
        )
        .unwrap()
    });

    let mut cleaned = String::with_capacity(input.len());
    let mut inside_fence = false;
    let mut wrote_line = false;
    for line in input.lines() {
        // Separator goes before every line except the first.
        if wrote_line {
            cleaned.push('\n');
        }
        wrote_line = true;

        // A fence marker flips the state and is itself emitted verbatim.
        let is_fence_marker = line.trim().starts_with("```");
        if is_fence_marker {
            inside_fence = !inside_fence;
        }

        if is_fence_marker || inside_fence {
            cleaned.push_str(line);
        } else {
            cleaned.push_str(&A11Y_PATTERN.replace_all(line, ""));
        }
    }
    cleaned
 }
 // ---------------------------------------------------------------------------
 // Spaced-out text collapsing (CSS animation artifacts)
 // ---------------------------------------------------------------------------
@ -1356,4 +1395,48 @@ mod tests {
        let input = "```\nImage of something in code\n```";
        assert_eq!(strip_alt_text_noise(input), input);
    }
    #[test]
    fn a11y_strips_opens_new_tab() {
        // Two annotated link labels share one line; neither annotation may survive.
        let out =
            strip_a11y_link_chrome("Download the App, opens new tab and Subscribe, opens new tab.");
        let lowered = out.to_lowercase();
        assert!(!lowered.contains("opens new tab"), "leak: {out}");
        // The real link labels must still be present.
        assert!(out.contains("Download the App"));
        assert!(out.contains("Subscribe"));
    }
    #[test]
    fn a11y_strips_external_link_variants() {
        // Each pair is (input, exact expected output). Comparing the whole
        // output matters: a prefix check alone would pass for
        // "More info external link" even if nothing had been stripped.
        let cases = [
            ("Visit our docs, opens external link", "Visit our docs"),
            ("Click here, opens in a new window.", "Click here"),
            ("More info external link", "More info"),
        ];
        for (input, expected) in cases {
            let out = strip_a11y_link_chrome(input);
            assert_eq!(out, expected, "input={input:?} got={out:?}");
        }
    }
    #[test]
    fn a11y_preserves_code_blocks() {
        // A fenced block containing the phrase, followed by an annotated label.
        let input = [
            "```",
            "opens new tab is a function",
            "```",
            "Download, opens new tab",
        ]
        .join("\n");
        let out = strip_a11y_link_chrome(&input);
        // Text inside the fence is left exactly as written.
        assert!(
            out.contains("opens new tab is a function"),
            "code stripped: {out}"
        );
        // Outside the fence, the chrome is removed.
        assert!(!out.to_lowercase().contains("download, opens new tab"));
    }
    #[test]
    fn a11y_preserves_external_link_prose() {
        // "external link" used mid-sentence as ordinary prose must survive.
        let prose = "Researchers found an external link between the two incidents.";
        let out = strip_a11y_link_chrome(prose);
        assert_eq!(out, prose);
    }
 }
--- a/crates/webclaw-core/src/llm/links.rs
+++ b/crates/webclaw-core/src/llm/links.rs
@ -88,10 +88,19 @@ fn is_noise_link(text: &str, href: &str) -> bool {
 /// Matches inline markdown markers that are stripped from link labels:
 /// heading prefixes (one to six `#` followed by whitespace), one or two `*`,
 /// one or two `_`, and single backticks.
 static MD_MARKERS_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"#{1,6}\s+|\*{1,2}|_{1,2}|`").unwrap());
 /// Case-insensitive matcher for accessibility annotations attached to link
 /// labels. Each alternative also consumes leading whitespace and an optional
 /// trailing period:
 ///
 /// - "opens [in] [a] new tab|window" and "opens external link|website", with
 ///   or without a leading comma;
 /// - "external link" only when it follows a comma, or when it is the last
 ///   thing in the label, so a label that merely mentions an external link
 ///   mid-phrase is left alone.
 static A11Y_LABEL_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?i)(?:\s*,?\s*(?:opens (?:in )?(?:a )?new (?:tab|window)|opens external (?:link|website))\b\.?|\s*,\s*external link\b\.?|\s+external link\b\.?$)",
    )
    .unwrap()
 });
 /// Clean a link label: strip markdown, dedup repeated phrases, truncate.
 pub(crate) fn clean_link_label(raw: &str) -> String {
    // Strip markdown markers
    let label = MD_MARKERS_RE.replace_all(raw, "").to_string();
    // Strip a11y link chrome ("opens new tab", etc.)
    let label = A11Y_LABEL_RE.replace_all(&label, "").to_string();
    let label = label.split_whitespace().collect::<Vec<_>>().join(" ");
    // Dedup repeated phrases in label
@ -181,4 +190,20 @@ mod tests {
        assert!(is_noise_link("user", "https://hn.com/user?id=foo"));
        assert!(!is_noise_link("Rust docs", "https://rust-lang.org"));
    }
    #[test]
    fn link_label_preserves_external_link_prose() {
        // "external link" in the middle of a label is prose, not chrome.
        let label = "Research found an external link between incidents";
        assert_eq!(clean_link_label(label), label);
    }
    #[test]
    fn link_label_strips_terminal_external_link_chrome() {
        // A trailing "external link" with no comma is treated as chrome.
        let cleaned = clean_link_label("Reuters story external link");
        assert_eq!(cleaned, "Reuters story");
    }
 }
--- a/crates/webclaw-core/src/llm/mod.rs
+++ b/crates/webclaw-core/src/llm/mod.rs
@ -46,15 +46,73 @@ pub fn to_llm_text(result: &ExtractionResult, url: Option<&str>) -> String {
    }
    // -- 4. Structured data (NEXT_DATA, SvelteKit, JSON-LD) --
-    if !result.structured_data.is_empty() {
+    // Only emit useful items: Schema.org records with a meaningful @type,
-        out.push_str("\n\n## Structured Data\n\n```json\n");
+    // and only if the total serialized size stays under a budget. Framework
-        out.push_str(&serde_json::to_string_pretty(&result.structured_data).unwrap_or_default());
+    // hydration blobs (Next.js pageProps full of ad-targeting flags, build
-        out.push_str("\n```");
+    // IDs, schedule paths) explode to hundreds of KB and drown the LLM in
    // noise — drop them rather than ship them.
    let useful: Vec<_> = result
        .structured_data
        .iter()
        .filter(|v| is_useful_structured_data(v))
        .cloned()
        .collect();
    if !useful.is_empty() {
        let serialized = serde_json::to_string_pretty(&useful).unwrap_or_default();
        const STRUCTURED_DATA_MAX_BYTES: usize = 16 * 1024;
        if serialized.len() <= STRUCTURED_DATA_MAX_BYTES {
            out.push_str("\n\n## Structured Data\n\n```json\n");
            out.push_str(&serialized);
            out.push_str("\n```");
        }
    }
    out.trim().to_string()
 }
 /// Decide whether a structured-data value carries content worth emitting.
 ///
 /// The rules, in order:
 ///
 /// - **Non-objects.** Arrays (SvelteKit can emit compact arrays of page data)
 ///   are kept only when their compact JSON serialization is at most 4 KiB.
 ///   Every other non-object value is dropped.
 /// - **Objects with `@type` (JSON-LD).** Kept when at least one type name is
 ///   not one of the low-information chrome types `WebSite`, `WebPage` or
 ///   `SiteNavigationElement` (compared ASCII-case-insensitively). Any other
 ///   type (Article, NewsArticle, Product, Recipe, FAQPage, HowTo, Event,
 ///   Person, Organization, BreadcrumbList, ItemList, VideoObject,
 ///   JobPosting, ...) is therefore kept. An `@type` that is neither a string
 ///   nor an array containing at least one string is dropped.
 /// - **Objects without `@type`** (Next.js `pageProps`-style blobs, SvelteKit
 ///   data). Kept only when their compact JSON serialization is at most
 ///   4 KiB; anything larger is almost certainly hydration state, not content.
 fn is_useful_structured_data(v: &serde_json::Value) -> bool {
    // Shared size budget for values that have no `@type` to judge them by.
    const UNTYPED_MAX_BYTES: usize = 4 * 1024;
    // Lower-cased schema.org types that are navigation/page chrome.
    const DROP_TYPES: &[&str] = &["website", "webpage", "sitenavigationelement"];

    // A serialization failure yields an empty string, which counts as fitting.
    let fits_untyped_budget =
        || serde_json::to_string(v).unwrap_or_default().len() <= UNTYPED_MAX_BYTES;

    let Some(obj) = v.as_object() else {
        return v.is_array() && fits_untyped_budget();
    };

    let Some(t) = obj.get("@type") else {
        return fits_untyped_budget();
    };

    // JSON-LD: `@type` drives the decision. One non-chrome type is enough.
    let is_content_type =
        |name: &str| !DROP_TYPES.contains(&name.to_ascii_lowercase().as_str());
    match t {
        serde_json::Value::String(s) => is_content_type(s.as_str()),
        serde_json::Value::Array(a) => a.iter().filter_map(|x| x.as_str()).any(is_content_type),
        _ => false,
    }
 }
 // ---------------------------------------------------------------------------
 // Integration tests that exercise the full pipeline through to_llm_text
 // ---------------------------------------------------------------------------
@ -700,4 +758,86 @@ mod tests {
        assert!(out.contains("Some content"), "Content before lost: {out}");
        assert!(out.contains("More content"), "Content after lost: {out}");
    }
    // -- Structured-data gating tests --
    /// Build a minimal extraction result (body `# Body`) whose structured
    /// data is replaced by `values`.
    fn make_result_with_structured(values: Vec<serde_json::Value>) -> ExtractionResult {
        let mut result = make_result("# Body");
        result.structured_data = values;
        result
    }
    #[test]
    fn structured_data_drops_chrome_types() {
        // A lone WebSite record is page chrome, so no section may be emitted.
        let website = serde_json::json!({
            "@type": "WebSite",
            "name": "Example",
            "url": "https://example.com"
        });
        let r = make_result_with_structured(vec![website]);
        let out = to_llm_text(&r, None);
        let has_section = out.contains("## Structured Data");
        assert!(!has_section, "WebSite chrome leaked into output: {out}");
    }
    #[test]
    fn structured_data_keeps_article_types() {
        // A content-bearing JSON-LD type must reach the output with its fields.
        let article = serde_json::json!({
            "@type": "NewsArticle",
            "headline": "Big news",
            "datePublished": "2026-05-10"
        });
        let r = make_result_with_structured(vec![article]);
        let out = to_llm_text(&r, None);
        let has_section = out.contains("## Structured Data");
        assert!(has_section, "NewsArticle dropped: {out}");
        assert!(out.contains("Big news"));
    }
    #[test]
    fn structured_data_drops_oversized_blob() {
        // An untyped, pageProps-style object padded to 32KB must be rejected.
        let big = "x".repeat(32 * 1024);
        let blob = serde_json::json!({
            "buildId": "abc",
            "isFallback": false,
            "noise": big
        });
        let r = make_result_with_structured(vec![blob]);
        let out = to_llm_text(&r, None);
        let has_section = out.contains("## Structured Data");
        assert!(
            !has_section,
            "Oversized untyped blob leaked: len={}",
            out.len()
        );
    }
    #[test]
    fn structured_data_keeps_compact_untyped() {
        // A small object without `@type` (e.g. parsed pageProps with real
        // content) is within budget and must be emitted.
        let record = serde_json::json!({
            "title": "Hi",
            "body": "small enough to keep"
        });
        let r = make_result_with_structured(vec![record]);
        let out = to_llm_text(&r, None);
        let has_section = out.contains("## Structured Data");
        assert!(has_section, "Compact untyped dropped: {out}");
    }
    #[test]
    fn structured_data_keeps_compact_untyped_array() {
        // SvelteKit can emit a compact top-level array instead of an object.
        let items = serde_json::json!([
            { "title": "Hi", "body": "small array item" }
        ]);
        let r = make_result_with_structured(vec![items]);
        let out = to_llm_text(&r, None);
        let kept = out.contains("small array item");
        assert!(kept, "Compact untyped array dropped: {out}");
    }
 }
--- a/crates/webclaw-core/src/markdown.rs
+++ b/crates/webclaw-core/src/markdown.rs
@ -320,6 +320,9 @@ fn children_to_md(
                }
            }
            Node::Text(text) => {
                if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) {
                    out.push(' ');
                }
                out.push_str(text);
            }
            _ => {}
@ -350,6 +353,9 @@ fn inline_text(
                }
            }
            Node::Text(text) => {
                if !text.is_empty() && !out.is_empty() && needs_separator(&out, text) {
                    out.push(' ');
                }
                out.push_str(text);
            }
            _ => {}
@ -361,11 +367,65 @@ fn inline_text(
 /// Check whether a space is needed between two adjacent chunks of output.
 /// Returns true when the left side doesn't end with whitespace and the right
-/// side doesn't start with whitespace — i.e., two words would be mashed together.
+/// side doesn't start with whitespace, except around punctuation that should
 /// bind to the adjacent token.
 fn needs_separator(left: &str, right: &str) -> bool {
-    let l = left.as_bytes().last().copied().unwrap_or(b' ');
+    let l = left.chars().next_back().unwrap_or(' ');
-    let r = right.as_bytes().first().copied().unwrap_or(b' ');
+    let r = right.chars().next().unwrap_or(' ');
-    !l.is_ascii_whitespace() && !r.is_ascii_whitespace()
+
    // Whitespace on either side already separates the chunks. An empty side
    // was defaulted to ' ' above, so it also ends up here.
    if l.is_whitespace() || r.is_whitespace() {
        return false;
    }
    // Do not create "word ," / "word )" / "word 's" artifacts.
    if is_closing_punctuation(r) {
        return false;
    }
    // Do not create "( word" / "[ 1" artifacts.
    if is_opening_punctuation(l) {
        return false;
    }
    // Common inline-code suffixes: `Option`s, `x`'s. Treat them like a
    // single token rather than separating the text node.
    if matches!(l, '`' | ')') && starts_with_inline_code_suffix(right) {
        return false;
    }
    // Two non-whitespace characters meet with no binding punctuation rule
    // applying: a space is needed.
    true
 }
 /// Report whether `s` begins with a suffix that should stay glued to a
 /// preceding inline-code span or link, such as the `s` in `Option`s or the
 /// `'s` in `x`'s.
 ///
 /// Leading `*` / `_` emphasis markers are skipped first. After that the text
 /// qualifies when it starts with an apostrophe (`'` or `’`), or with a lone
 /// `s` / `S` that is followed by nothing, whitespace, closing punctuation,
 /// or another emphasis marker.
 fn starts_with_inline_code_suffix(s: &str) -> bool {
    let mut rest = s.trim_start_matches(['*', '_']).chars();
    match (rest.next(), rest.next()) {
        // Possessive / contraction: always a suffix.
        (Some('\'' | '’'), _) => true,
        // Plural "s" that ends the text.
        (Some('s' | 'S'), None) => true,
        // Plural "s" that ends the word.
        (Some('s' | 'S'), Some(after)) => {
            after.is_whitespace() || is_closing_punctuation(after) || matches!(after, '*' | '_')
        }
        _ => false,
    }
 }
 /// True for characters that bind to the token on their left (sentence
 /// punctuation, closing brackets, `%`, and straight or closing curly quotes).
 fn is_closing_punctuation(c: char) -> bool {
    const CLOSERS: [char; 14] = [
        '.', ',', ';', ':', '!', '?', ')', ']', '}', '%', '\'', '’', '"', '”',
    ];
    CLOSERS.contains(&c)
 }
 /// True for characters that bind to the token on their right (opening
 /// brackets, the straight double quote, and the opening curly double quote).
 fn is_opening_punctuation(c: char) -> bool {
    const OPENERS: [char; 5] = ['(', '[', '{', '"', '“'];
    OPENERS.contains(&c)
 }
 /// Collect raw text content (no markdown formatting).
@ -1606,4 +1666,39 @@ mod tests {
            "collapse_whitespace stripped 6-space indent: {output}"
        );
    }
    #[test]
    fn text_after_inline_element_keeps_separator() {
        // Modelled on Reuters-style markup such as
        // <a><time>3h</time>ago</a><a>Tanker crosses...</a>: a bare text node
        // ("ago") sandwiched between two element children. If the Text branch
        // skipped the separator check, the output would read "agoTanker".
        let html = "<div><span>3h</span>ago<span>Tanker crosses Strait</span></div>";
        let md = convert_html(html, None).0;
        let smashed = md.contains("agoTanker");
        assert!(!smashed, "Element->Text->Element smashed together: {md}");
    }
    #[test]
    fn punctuation_after_inline_element_stays_attached() {
        // A comma after a <span> and a period after <code> must not gain a
        // leading space.
        let html = "<p><span>Hello</span>, world. Use <code>package.json</code>.</p>";
        let md = convert_html(html, None).0;
        assert!(md.contains("Hello, world"), "punctuation detached: {md}");
        let code_ok = md.contains("`package.json`.");
        assert!(code_ok, "code punctuation detached: {md}");
    }
    #[test]
    fn inline_code_suffix_stays_attached() {
        // A linked code span followed by an emphasised plural "s" must render
        // as one token, with no space between the link and the suffix.
        let html = r#"<p><a href="https://example.com"><code>NullPointerException</code></a><em>s</em> are common.</p>"#;
        let md = convert_html(html, None).0;
        let expected = "[`NullPointerException`](https://example.com)*s* are common";
        assert!(md.contains(expected), "code suffix detached: {md}");
    }
 }